diff --git a/.gitattributes b/.gitattributes index f62f34a441eade728639673c956e9eaef30b0fe2..bacca141e0c9b4e21f009985ba9a5ec4a01d016a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1502,3 +1502,12 @@ Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-10000/checkpoint-17010/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-10000/checkpoint-3402/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-10000/checkpoint-6804/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ddb940b0065fc9fadb6bb28fc3172c7816d2bb6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b956960a1e2893d33dab26c459e8cb4362cd9ee70a757e4df6520cceecb170 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8ddb940b0065fc9fadb6bb28fc3172c7816d2bb6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b956960a1e2893d33dab26c459e8cb4362cd9ee70a757e4df6520cceecb170 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef41c3a5d8b84a1e93fe50ff77f6dd180daef579 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef0195d6993677f23bfb8540922d65d9e9bfe4b8f9f429b021a4155ddf3982b2 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..419ce2e6f8813d473d33c30e2289cd337bbcfdb7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99ba495d4ab1840ee9951582792a050f2fe3179785b4b883e346422c9a730243 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9616b7af71ef16a67656b667329e9436cede01cb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3579e02da786784d52866436542b8c8b23ad5e62fdfb037e8ab0bbef797c6d56 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e19332d82f126194bd6bc202928ca916cbbc9ac2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/trainer_state.json @@ -0,0 +1,1302 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1794, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.302230307130573e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..78b7214dadec31bf9f176bd54452b7d94bcad14b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad384512f97772513a3b27b6f16370758c7f7314b2d95f77424ec2b4e9534d66 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c121f928faec8684b6759632ffb9ff8f164645c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ead93fa5c6608b0024a5a80847971872fdae7e937ae56275a32b6f60477ed7 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..539ecd9316de02f755611c66765a175370d75687 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea28d69f6858776bf742fb9fad0f3e9e1bc5ee18ee5b1410fefb5ea032b616e1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..91ef42833b44e81d34de68070e40095936457083 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45e2a9aea793b35544a02d328f61205ab3c8f3374fd12d3ff93303f524691bc7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..047bd0dfc5b4ad6a5be156dccffc2980e5646745 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/trainer_state.json @@ -0,0 +1,1940 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2691, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.245334546069586e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-2691/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..109a41be24d100101a02bb3add2d07495ed815bd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b94526f26352f101088f2e4446dc5c699d8c645ca0bc67c10856cd4b0d76a7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..53b532bdf8e453df7975d3ff0ff6353e1823874e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5435a2443ac32c74ed2e4c8cedbd64ce74f8a101a9eb7dbd5fd7cdc7fb4e9e05 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..756bcbc6479df8071850ad0dd9b386f6526070ab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7dc86fbae922d4e5b672fe298098b5fe9d69bdd4b20a9a5b5cbdb927b9b6055 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e2436144abb4db10ee49740e37d16f8a509bc7f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b58de2447329d8c261d7b43dccb8f2dcd6054ef422fa390f7efc8f1cf107938f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..950280e3a31199cb8ad0a7d660bc5dc59a2bfcf7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/trainer_state.json @@ -0,0 +1,2571 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3588, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + }, + { + "epoch": 3.0100334448160537, + "grad_norm": 0.4742245078086853, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2700 + }, + { + "epoch": 3.021181716833891, + "grad_norm": 0.5157448649406433, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 2710 + }, + { + "epoch": 3.032329988851728, + "grad_norm": 0.5634726285934448, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 2720 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 0.4554799199104309, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2730 + }, + { + "epoch": 3.0546265328874025, + "grad_norm": 0.6565208435058594, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 2740 + }, + { + "epoch": 3.0657748049052396, + "grad_norm": 0.6174370050430298, + "learning_rate": 0.0002, + "loss": 1.459, + "step": 2750 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.4987483024597168, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 2760 + }, + { + "epoch": 3.088071348940914, + "grad_norm": 0.5810927152633667, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2770 + }, + { + "epoch": 3.0992196209587513, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0002, + "loss": 1.4936, + "step": 2780 + }, + { + "epoch": 3.1103678929765888, + "grad_norm": 0.5479053854942322, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 2790 + }, + { + "epoch": 3.121516164994426, + "grad_norm": 0.6192978620529175, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2800 + }, + { + "epoch": 3.132664437012263, + "grad_norm": 0.560117781162262, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2810 + }, + { + "epoch": 3.1438127090301005, + "grad_norm": 0.6067224740982056, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 3.1549609810479375, + "grad_norm": 0.611287534236908, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 2830 + }, + { + "epoch": 3.1661092530657746, + "grad_norm": 0.6441587209701538, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 2840 + }, + { + "epoch": 3.177257525083612, + "grad_norm": 0.5955114364624023, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2850 + }, + { + "epoch": 3.1884057971014492, + "grad_norm": 0.5554782748222351, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 2860 + }, + { + "epoch": 3.1995540691192863, + "grad_norm": 0.5411370992660522, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 2870 + }, + { + "epoch": 3.210702341137124, + "grad_norm": 0.6152016520500183, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 2880 + }, + { + "epoch": 3.221850613154961, + "grad_norm": 0.5711581110954285, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2890 + }, + { + "epoch": 3.2329988851727984, + "grad_norm": 0.5399307012557983, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2900 + }, + { + "epoch": 3.2441471571906355, + "grad_norm": 0.60606849193573, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2910 + }, + { + "epoch": 3.2552954292084726, + "grad_norm": 0.5873523950576782, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2920 + }, + { + "epoch": 3.26644370122631, + "grad_norm": 0.6149439215660095, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 2930 + }, + { + "epoch": 3.277591973244147, + "grad_norm": 0.5940659046173096, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 2940 + }, + { + "epoch": 3.2887402452619843, + "grad_norm": 0.6846756339073181, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 2950 + }, + { + "epoch": 3.299888517279822, + "grad_norm": 0.6708254218101501, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2960 + }, + { + "epoch": 3.311036789297659, + "grad_norm": 0.5966503620147705, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 2970 + }, + { + "epoch": 3.322185061315496, + "grad_norm": 0.6328812837600708, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 2980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6082745790481567, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 2990 + }, + { + "epoch": 3.3444816053511706, + "grad_norm": 0.6207539439201355, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 3000 + }, + { + "epoch": 3.3556298773690076, + "grad_norm": 0.5501793026924133, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 3010 + }, + { + "epoch": 3.366778149386845, + "grad_norm": 0.571275532245636, + "learning_rate": 0.0002, + "loss": 1.4428, + "step": 3020 + }, + { + "epoch": 3.3779264214046822, + "grad_norm": 0.7003518342971802, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3030 + }, + { + "epoch": 3.3890746934225193, + "grad_norm": 0.609527587890625, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3040 + }, + { + "epoch": 3.400222965440357, + "grad_norm": 0.5880036354064941, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 3050 + }, + { + "epoch": 3.411371237458194, + "grad_norm": 0.5847334265708923, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 3060 + }, + { + "epoch": 3.4225195094760315, + "grad_norm": 0.5373924970626831, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 3070 + }, + { + "epoch": 3.4336677814938685, + "grad_norm": 0.6074833869934082, + "learning_rate": 0.0002, + "loss": 1.5215, + "step": 3080 + }, + { + "epoch": 3.4448160535117056, + "grad_norm": 0.5118414163589478, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 3090 + }, + { + "epoch": 3.4559643255295427, + "grad_norm": 0.5577956438064575, + "learning_rate": 0.0002, + "loss": 1.5006, + "step": 3100 + }, + { + "epoch": 3.46711259754738, + "grad_norm": 0.5654811859130859, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3110 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 0.6216017603874207, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 3120 + }, + { + "epoch": 3.489409141583055, + "grad_norm": 0.5983642339706421, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 3130 + }, + { + "epoch": 3.500557413600892, + "grad_norm": 0.6635708212852478, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3140 + }, + { + "epoch": 3.511705685618729, + "grad_norm": 0.6254258751869202, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 3150 + }, + { + "epoch": 3.522853957636566, + "grad_norm": 0.6359851360321045, + "learning_rate": 0.0002, + "loss": 1.4934, + "step": 3160 + }, + { + "epoch": 3.5340022296544036, + "grad_norm": 0.5938616394996643, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 3170 + }, + { + "epoch": 3.5451505016722407, + "grad_norm": 0.6360630393028259, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 3180 + }, + { + "epoch": 3.556298773690078, + "grad_norm": 0.6097670197486877, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3190 + }, + { + "epoch": 3.5674470457079153, + "grad_norm": 0.5984025597572327, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3200 + }, + { + "epoch": 3.5785953177257523, + "grad_norm": 0.5463748574256897, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 3210 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 1.0017699003219604, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3220 + }, + { + "epoch": 3.600891861761427, + "grad_norm": 0.6519441604614258, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3230 + }, + { + "epoch": 3.6120401337792645, + "grad_norm": 0.6457271575927734, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 3240 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 0.5898868441581726, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 3250 + }, + { + "epoch": 3.6343366778149386, + "grad_norm": 0.6612270474433899, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3260 + }, + { + "epoch": 3.6454849498327757, + "grad_norm": 0.5102090239524841, + "learning_rate": 0.0002, + "loss": 1.4537, + "step": 3270 + }, + { + "epoch": 3.6566332218506132, + "grad_norm": 0.5357231497764587, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 3280 + }, + { + "epoch": 3.6677814938684503, + "grad_norm": 0.6176130175590515, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3290 + }, + { + "epoch": 3.678929765886288, + "grad_norm": 0.6384354829788208, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3300 + }, + { + "epoch": 3.690078037904125, + "grad_norm": 0.5493269562721252, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3310 + }, + { + "epoch": 3.701226309921962, + "grad_norm": 0.5721797943115234, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 3320 + }, + { + "epoch": 3.712374581939799, + "grad_norm": 0.6667633056640625, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 3330 + }, + { + "epoch": 3.7235228539576366, + "grad_norm": 0.5713372826576233, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 3340 + }, + { + "epoch": 3.7346711259754737, + "grad_norm": 0.5925018191337585, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3350 + }, + { + "epoch": 3.745819397993311, + "grad_norm": 0.5660955905914307, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 3360 + }, + { + "epoch": 3.7569676700111483, + "grad_norm": 0.5470759868621826, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 3370 + }, + { + "epoch": 3.7681159420289854, + "grad_norm": 0.7612935900688171, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3380 + }, + { + "epoch": 3.779264214046823, + "grad_norm": 0.577467679977417, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3390 + }, + { + "epoch": 3.79041248606466, + "grad_norm": 0.6125091910362244, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3400 + }, + { + "epoch": 3.801560758082497, + "grad_norm": 0.590386152267456, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 3410 + }, + { + "epoch": 3.8127090301003346, + "grad_norm": 0.5530361533164978, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 3420 + }, + { + "epoch": 3.8238573021181717, + "grad_norm": 0.5714079737663269, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 3430 + }, + { + "epoch": 3.8350055741360087, + "grad_norm": 0.9061086773872375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 3440 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.6193320751190186, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3450 + }, + { + "epoch": 3.8573021181716833, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3460 + }, + { + "epoch": 3.868450390189521, + "grad_norm": 0.5971192717552185, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 3470 + }, + { + "epoch": 3.879598662207358, + "grad_norm": 0.6110154390335083, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 3480 + }, + { + "epoch": 3.890746934225195, + "grad_norm": 0.6644453406333923, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3490 + }, + { + "epoch": 3.901895206243032, + "grad_norm": 0.6674908399581909, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3500 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 0.5516519546508789, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3510 + }, + { + "epoch": 3.9241917502787067, + "grad_norm": 0.6704319715499878, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3520 + }, + { + "epoch": 3.9353400222965442, + "grad_norm": 0.5820314288139343, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3530 + }, + { + "epoch": 3.9464882943143813, + "grad_norm": 0.6931548714637756, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3540 + }, + { + "epoch": 3.9576365663322184, + "grad_norm": 0.6085171103477478, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 3550 + }, + { + "epoch": 3.9687848383500555, + "grad_norm": 0.5973535776138306, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3560 + }, + { + "epoch": 3.979933110367893, + "grad_norm": 0.49761658906936646, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3570 + }, + { + "epoch": 3.99108138238573, + "grad_norm": 0.6282512545585632, + "learning_rate": 0.0002, + "loss": 1.488, + "step": 3580 + }, + { + "epoch": 4.0, + "eval_loss": 1.8790398836135864, + "eval_runtime": 37.9725, + "eval_samples_per_second": 13.562, + "eval_steps_per_second": 1.712, + "step": 3588 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6604460614261146e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-3588/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..628d32ae1a0825561d12848c25bdff3ca630dee9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f312de1f7e4526babaeda66bf8cd89d28d985aeed53929cb6296fa2a8fead504 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1cf12ffb057829ed001b65599c15da59b9751de --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:447e0535337229e067f6107667c18d167ead8613ef6de34bd6a96a4a1c5dfa11 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8321355b9c2665c7141f38c7a85118e9228393f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8282a06a1f1002f83662cd44f0b64fb74dd1c3cd4ea3314328433cc8fd2906c9 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c3d65b339fd4a14485a3922a1d67f7c62981af9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b01d6d03bd598ffbe3f696181c74c43b42f556e6e259a77d2d3d11ec88e331d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f86abccb38dc3d6e61b6a099a92059f814001887 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/trainer_state.json @@ -0,0 +1,3209 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 4485, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + }, + { + "epoch": 3.0100334448160537, + "grad_norm": 0.4742245078086853, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2700 + }, + { + "epoch": 3.021181716833891, + "grad_norm": 0.5157448649406433, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 2710 + }, + { + "epoch": 3.032329988851728, + "grad_norm": 0.5634726285934448, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 2720 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 0.4554799199104309, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2730 + }, + { + "epoch": 3.0546265328874025, + "grad_norm": 0.6565208435058594, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 2740 + }, + { + "epoch": 3.0657748049052396, + "grad_norm": 0.6174370050430298, + "learning_rate": 0.0002, + "loss": 1.459, + "step": 2750 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.4987483024597168, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 2760 + }, + { + "epoch": 3.088071348940914, + "grad_norm": 0.5810927152633667, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2770 + }, + { + "epoch": 3.0992196209587513, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0002, + "loss": 1.4936, + "step": 2780 + }, + { + "epoch": 3.1103678929765888, + "grad_norm": 0.5479053854942322, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 2790 + }, + { + "epoch": 3.121516164994426, + "grad_norm": 0.6192978620529175, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2800 + }, + { + "epoch": 3.132664437012263, + "grad_norm": 0.560117781162262, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2810 + }, + { + "epoch": 3.1438127090301005, + "grad_norm": 0.6067224740982056, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 3.1549609810479375, + "grad_norm": 0.611287534236908, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 2830 + }, + { + "epoch": 3.1661092530657746, + "grad_norm": 0.6441587209701538, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 2840 + }, + { + "epoch": 3.177257525083612, + "grad_norm": 0.5955114364624023, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2850 + }, + { + "epoch": 3.1884057971014492, + "grad_norm": 0.5554782748222351, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 2860 + }, + { + "epoch": 3.1995540691192863, + "grad_norm": 0.5411370992660522, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 2870 + }, + { + "epoch": 3.210702341137124, + "grad_norm": 0.6152016520500183, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 2880 + }, + { + "epoch": 3.221850613154961, + "grad_norm": 0.5711581110954285, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2890 + }, + { + "epoch": 3.2329988851727984, + "grad_norm": 0.5399307012557983, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2900 + }, + { + "epoch": 3.2441471571906355, + "grad_norm": 0.60606849193573, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2910 + }, + { + "epoch": 3.2552954292084726, + "grad_norm": 0.5873523950576782, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2920 + }, + { + "epoch": 3.26644370122631, + "grad_norm": 0.6149439215660095, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 2930 + }, + { + "epoch": 3.277591973244147, + "grad_norm": 0.5940659046173096, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 2940 + }, + { + "epoch": 3.2887402452619843, + "grad_norm": 0.6846756339073181, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 2950 + }, + { + "epoch": 3.299888517279822, + "grad_norm": 0.6708254218101501, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2960 + }, + { + "epoch": 3.311036789297659, + "grad_norm": 0.5966503620147705, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 2970 + }, + { + "epoch": 3.322185061315496, + "grad_norm": 0.6328812837600708, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 2980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6082745790481567, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 2990 + }, + { + "epoch": 3.3444816053511706, + "grad_norm": 0.6207539439201355, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 3000 + }, + { + "epoch": 3.3556298773690076, + "grad_norm": 0.5501793026924133, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 3010 + }, + { + "epoch": 3.366778149386845, + "grad_norm": 0.571275532245636, + "learning_rate": 0.0002, + "loss": 1.4428, + "step": 3020 + }, + { + "epoch": 3.3779264214046822, + "grad_norm": 0.7003518342971802, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3030 + }, + { + "epoch": 3.3890746934225193, + "grad_norm": 0.609527587890625, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3040 + }, + { + "epoch": 3.400222965440357, + "grad_norm": 0.5880036354064941, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 3050 + }, + { + "epoch": 3.411371237458194, + "grad_norm": 0.5847334265708923, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 3060 + }, + { + "epoch": 3.4225195094760315, + "grad_norm": 0.5373924970626831, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 3070 + }, + { + "epoch": 3.4336677814938685, + "grad_norm": 0.6074833869934082, + "learning_rate": 0.0002, + "loss": 1.5215, + "step": 3080 + }, + { + "epoch": 3.4448160535117056, + "grad_norm": 0.5118414163589478, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 3090 + }, + { + "epoch": 3.4559643255295427, + "grad_norm": 0.5577956438064575, + "learning_rate": 0.0002, + "loss": 1.5006, + "step": 3100 + }, + { + "epoch": 3.46711259754738, + "grad_norm": 0.5654811859130859, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3110 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 0.6216017603874207, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 3120 + }, + { + "epoch": 3.489409141583055, + "grad_norm": 0.5983642339706421, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 3130 + }, + { + "epoch": 3.500557413600892, + "grad_norm": 0.6635708212852478, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3140 + }, + { + "epoch": 3.511705685618729, + "grad_norm": 0.6254258751869202, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 3150 + }, + { + "epoch": 3.522853957636566, + "grad_norm": 0.6359851360321045, + "learning_rate": 0.0002, + "loss": 1.4934, + "step": 3160 + }, + { + "epoch": 3.5340022296544036, + "grad_norm": 0.5938616394996643, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 3170 + }, + { + "epoch": 3.5451505016722407, + "grad_norm": 0.6360630393028259, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 3180 + }, + { + "epoch": 3.556298773690078, + "grad_norm": 0.6097670197486877, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3190 + }, + { + "epoch": 3.5674470457079153, + "grad_norm": 0.5984025597572327, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3200 + }, + { + "epoch": 3.5785953177257523, + "grad_norm": 0.5463748574256897, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 3210 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 1.0017699003219604, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3220 + }, + { + "epoch": 3.600891861761427, + "grad_norm": 0.6519441604614258, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3230 + }, + { + "epoch": 3.6120401337792645, + "grad_norm": 0.6457271575927734, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 3240 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 0.5898868441581726, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 3250 + }, + { + "epoch": 3.6343366778149386, + "grad_norm": 0.6612270474433899, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3260 + }, + { + "epoch": 3.6454849498327757, + "grad_norm": 0.5102090239524841, + "learning_rate": 0.0002, + "loss": 1.4537, + "step": 3270 + }, + { + "epoch": 3.6566332218506132, + "grad_norm": 0.5357231497764587, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 3280 + }, + { + "epoch": 3.6677814938684503, + "grad_norm": 0.6176130175590515, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3290 + }, + { + "epoch": 3.678929765886288, + "grad_norm": 0.6384354829788208, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3300 + }, + { + "epoch": 3.690078037904125, + "grad_norm": 0.5493269562721252, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3310 + }, + { + "epoch": 3.701226309921962, + "grad_norm": 0.5721797943115234, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 3320 + }, + { + "epoch": 3.712374581939799, + "grad_norm": 0.6667633056640625, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 3330 + }, + { + "epoch": 3.7235228539576366, + "grad_norm": 0.5713372826576233, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 3340 + }, + { + "epoch": 3.7346711259754737, + "grad_norm": 0.5925018191337585, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3350 + }, + { + "epoch": 3.745819397993311, + "grad_norm": 0.5660955905914307, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 3360 + }, + { + "epoch": 3.7569676700111483, + "grad_norm": 0.5470759868621826, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 3370 + }, + { + "epoch": 3.7681159420289854, + "grad_norm": 0.7612935900688171, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3380 + }, + { + "epoch": 3.779264214046823, + "grad_norm": 0.577467679977417, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3390 + }, + { + "epoch": 3.79041248606466, + "grad_norm": 0.6125091910362244, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3400 + }, + { + "epoch": 3.801560758082497, + "grad_norm": 0.590386152267456, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 3410 + }, + { + "epoch": 3.8127090301003346, + "grad_norm": 0.5530361533164978, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 3420 + }, + { + "epoch": 3.8238573021181717, + "grad_norm": 0.5714079737663269, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 3430 + }, + { + "epoch": 3.8350055741360087, + "grad_norm": 0.9061086773872375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 3440 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.6193320751190186, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3450 + }, + { + "epoch": 3.8573021181716833, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3460 + }, + { + "epoch": 3.868450390189521, + "grad_norm": 0.5971192717552185, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 3470 + }, + { + "epoch": 3.879598662207358, + "grad_norm": 0.6110154390335083, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 3480 + }, + { + "epoch": 3.890746934225195, + "grad_norm": 0.6644453406333923, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3490 + }, + { + "epoch": 3.901895206243032, + "grad_norm": 0.6674908399581909, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3500 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 0.5516519546508789, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3510 + }, + { + "epoch": 3.9241917502787067, + "grad_norm": 0.6704319715499878, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3520 + }, + { + "epoch": 3.9353400222965442, + "grad_norm": 0.5820314288139343, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3530 + }, + { + "epoch": 3.9464882943143813, + "grad_norm": 0.6931548714637756, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3540 + }, + { + "epoch": 3.9576365663322184, + "grad_norm": 0.6085171103477478, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 3550 + }, + { + "epoch": 3.9687848383500555, + "grad_norm": 0.5973535776138306, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3560 + }, + { + "epoch": 3.979933110367893, + "grad_norm": 0.49761658906936646, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3570 + }, + { + "epoch": 3.99108138238573, + "grad_norm": 0.6282512545585632, + "learning_rate": 0.0002, + "loss": 1.488, + "step": 3580 + }, + { + "epoch": 4.0, + "eval_loss": 1.8790398836135864, + "eval_runtime": 37.9725, + "eval_samples_per_second": 13.562, + "eval_steps_per_second": 1.712, + "step": 3588 + }, + { + "epoch": 4.002229654403568, + "grad_norm": 0.6402973532676697, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3590 + }, + { + "epoch": 4.013377926421405, + "grad_norm": 0.7791030406951904, + "learning_rate": 0.0002, + "loss": 1.3695, + "step": 3600 + }, + { + "epoch": 4.024526198439242, + "grad_norm": 0.7136624455451965, + "learning_rate": 0.0002, + "loss": 1.3545, + "step": 3610 + }, + { + "epoch": 4.035674470457079, + "grad_norm": 0.7608486413955688, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 3620 + }, + { + "epoch": 4.046822742474917, + "grad_norm": 0.7486591935157776, + "learning_rate": 0.0002, + "loss": 1.3067, + "step": 3630 + }, + { + "epoch": 4.057971014492754, + "grad_norm": 0.7576302289962769, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 3640 + }, + { + "epoch": 4.069119286510591, + "grad_norm": 0.7358254194259644, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3650 + }, + { + "epoch": 4.080267558528428, + "grad_norm": 0.821326494216919, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 3660 + }, + { + "epoch": 4.091415830546265, + "grad_norm": 0.7996482253074646, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 3670 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.8527022004127502, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3680 + }, + { + "epoch": 4.11371237458194, + "grad_norm": 0.7313576340675354, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 3690 + }, + { + "epoch": 4.124860646599777, + "grad_norm": 0.7854588627815247, + "learning_rate": 0.0002, + "loss": 1.3307, + "step": 3700 + }, + { + "epoch": 4.136008918617614, + "grad_norm": 0.6588303446769714, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 3710 + }, + { + "epoch": 4.147157190635451, + "grad_norm": 0.7986254692077637, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3720 + }, + { + "epoch": 4.1583054626532885, + "grad_norm": 0.6864156126976013, + "learning_rate": 0.0002, + "loss": 1.3505, + "step": 3730 + }, + { + "epoch": 4.169453734671126, + "grad_norm": 0.8197885155677795, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 3740 + }, + { + "epoch": 4.1806020066889635, + "grad_norm": 0.7169402837753296, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 3750 + }, + { + "epoch": 4.191750278706801, + "grad_norm": 0.7948839068412781, + "learning_rate": 0.0002, + "loss": 1.4388, + "step": 3760 + }, + { + "epoch": 4.202898550724638, + "grad_norm": 0.6775302290916443, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3770 + }, + { + "epoch": 4.214046822742475, + "grad_norm": 0.8913543820381165, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3780 + }, + { + "epoch": 4.225195094760312, + "grad_norm": 0.8046368360519409, + "learning_rate": 0.0002, + "loss": 1.4251, + "step": 3790 + }, + { + "epoch": 4.236343366778149, + "grad_norm": 0.9359563589096069, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3800 + }, + { + "epoch": 4.247491638795987, + "grad_norm": 0.8012228608131409, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 3810 + }, + { + "epoch": 4.258639910813824, + "grad_norm": 0.8405851125717163, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 3820 + }, + { + "epoch": 4.269788182831661, + "grad_norm": 0.7812899351119995, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 3830 + }, + { + "epoch": 4.280936454849498, + "grad_norm": 0.8192463517189026, + "learning_rate": 0.0002, + "loss": 1.4006, + "step": 3840 + }, + { + "epoch": 4.292084726867335, + "grad_norm": 0.6937220096588135, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 3850 + }, + { + "epoch": 4.303232998885173, + "grad_norm": 0.7245703935623169, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 3860 + }, + { + "epoch": 4.31438127090301, + "grad_norm": 0.7816787362098694, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 3870 + }, + { + "epoch": 4.325529542920847, + "grad_norm": 0.7904975414276123, + "learning_rate": 0.0002, + "loss": 1.4316, + "step": 3880 + }, + { + "epoch": 4.336677814938684, + "grad_norm": 1.0394847393035889, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 3890 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.7044078707695007, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3900 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.8852819204330444, + "learning_rate": 0.0002, + "loss": 1.3185, + "step": 3910 + }, + { + "epoch": 4.3701226309921966, + "grad_norm": 0.7712758779525757, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3920 + }, + { + "epoch": 4.381270903010034, + "grad_norm": 0.7677774429321289, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 3930 + }, + { + "epoch": 4.392419175027871, + "grad_norm": 0.7450921535491943, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 3940 + }, + { + "epoch": 4.403567447045708, + "grad_norm": 0.7802795767784119, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 3950 + }, + { + "epoch": 4.414715719063545, + "grad_norm": 0.8976508378982544, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 3960 + }, + { + "epoch": 4.425863991081382, + "grad_norm": 0.8148922324180603, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3970 + }, + { + "epoch": 4.43701226309922, + "grad_norm": 0.7490504384040833, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 3980 + }, + { + "epoch": 4.448160535117057, + "grad_norm": 0.753652036190033, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 3990 + }, + { + "epoch": 4.459308807134894, + "grad_norm": 0.803986668586731, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4000 + }, + { + "epoch": 4.470457079152731, + "grad_norm": 0.8643081784248352, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 4010 + }, + { + "epoch": 4.481605351170568, + "grad_norm": 0.8298280835151672, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 4020 + }, + { + "epoch": 4.492753623188406, + "grad_norm": 0.705355703830719, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4030 + }, + { + "epoch": 4.503901895206243, + "grad_norm": 0.7845711708068848, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 4040 + }, + { + "epoch": 4.51505016722408, + "grad_norm": 0.8056256175041199, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 4050 + }, + { + "epoch": 4.5261984392419174, + "grad_norm": 0.7080171704292297, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4060 + }, + { + "epoch": 4.5373467112597545, + "grad_norm": 0.778388261795044, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 4070 + }, + { + "epoch": 4.548494983277592, + "grad_norm": 0.7337639927864075, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 4080 + }, + { + "epoch": 4.55964325529543, + "grad_norm": 0.815322756767273, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4090 + }, + { + "epoch": 4.570791527313267, + "grad_norm": 0.8817179203033447, + "learning_rate": 0.0002, + "loss": 1.4509, + "step": 4100 + }, + { + "epoch": 4.581939799331104, + "grad_norm": 0.7526060342788696, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 4110 + }, + { + "epoch": 4.593088071348941, + "grad_norm": 0.920465350151062, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4120 + }, + { + "epoch": 4.604236343366778, + "grad_norm": 0.7509559392929077, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4130 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.799469530582428, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 4140 + }, + { + "epoch": 4.626532887402453, + "grad_norm": 0.8099892735481262, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4150 + }, + { + "epoch": 4.63768115942029, + "grad_norm": 0.7790375351905823, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4160 + }, + { + "epoch": 4.648829431438127, + "grad_norm": 0.8292977809906006, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 4170 + }, + { + "epoch": 4.659977703455964, + "grad_norm": 0.8312386274337769, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 4180 + }, + { + "epoch": 4.671125975473801, + "grad_norm": 0.7348753809928894, + "learning_rate": 0.0002, + "loss": 1.4301, + "step": 4190 + }, + { + "epoch": 4.682274247491639, + "grad_norm": 0.8006551265716553, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 4200 + }, + { + "epoch": 4.693422519509476, + "grad_norm": 0.8477752804756165, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4210 + }, + { + "epoch": 4.704570791527313, + "grad_norm": 0.7056546211242676, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 4220 + }, + { + "epoch": 4.7157190635451505, + "grad_norm": 0.7858873009681702, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 4230 + }, + { + "epoch": 4.7268673355629875, + "grad_norm": 0.6968740224838257, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 4240 + }, + { + "epoch": 4.738015607580825, + "grad_norm": 0.7886689901351929, + "learning_rate": 0.0002, + "loss": 1.3594, + "step": 4250 + }, + { + "epoch": 4.749163879598662, + "grad_norm": 0.8935304880142212, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4260 + }, + { + "epoch": 4.7603121516165, + "grad_norm": 0.8395553231239319, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 4270 + }, + { + "epoch": 4.771460423634337, + "grad_norm": 0.817263126373291, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4280 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 0.7912008166313171, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 4290 + }, + { + "epoch": 4.793756967670011, + "grad_norm": 0.6637866497039795, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4300 + }, + { + "epoch": 4.804905239687848, + "grad_norm": 1.0709338188171387, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4310 + }, + { + "epoch": 4.816053511705686, + "grad_norm": 0.8179698586463928, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 4320 + }, + { + "epoch": 4.827201783723523, + "grad_norm": 0.7952052354812622, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4330 + }, + { + "epoch": 4.83835005574136, + "grad_norm": 0.7235367894172668, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 4340 + }, + { + "epoch": 4.849498327759197, + "grad_norm": 0.8484606742858887, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 4350 + }, + { + "epoch": 4.860646599777034, + "grad_norm": 0.7344942092895508, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 4360 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 0.9718546867370605, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 4370 + }, + { + "epoch": 4.882943143812709, + "grad_norm": 0.8174259066581726, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 4380 + }, + { + "epoch": 4.894091415830546, + "grad_norm": 0.8097165822982788, + "learning_rate": 0.0002, + "loss": 1.3244, + "step": 4390 + }, + { + "epoch": 4.9052396878483835, + "grad_norm": 0.756388783454895, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4400 + }, + { + "epoch": 4.916387959866221, + "grad_norm": 0.8324617743492126, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 4410 + }, + { + "epoch": 4.927536231884058, + "grad_norm": 0.8949803709983826, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 4420 + }, + { + "epoch": 4.938684503901895, + "grad_norm": 0.7663722634315491, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4430 + }, + { + "epoch": 4.949832775919733, + "grad_norm": 0.7727946043014526, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 4440 + }, + { + "epoch": 4.96098104793757, + "grad_norm": 0.6872350573539734, + "learning_rate": 0.0002, + "loss": 1.4351, + "step": 4450 + }, + { + "epoch": 4.972129319955407, + "grad_norm": 0.754357099533081, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 4460 + }, + { + "epoch": 4.983277591973244, + "grad_norm": 0.8068729639053345, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 4470 + }, + { + "epoch": 4.994425863991081, + "grad_norm": 0.8200556635856628, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 4480 + }, + { + "epoch": 5.0, + "eval_loss": 1.9543706178665161, + "eval_runtime": 37.9369, + "eval_samples_per_second": 13.575, + "eval_steps_per_second": 1.713, + "step": 4485 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0755575767826432e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-4485/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3a2dd2b20fa40f1c4114946cba5a663c9f0f0d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99f7f2672cd06f46ece6c34886fbd3956ac0809c46756a2fa5b7795ca19ea7e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a89a0a14f540ba62cfd98a9ac38e370adefc0894 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf51b70ed45995db5a6778f96706d6add63845a0a83878d5792505d61331c7d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35d2570c5910688dd73dbc0686904e1e0eb5d004 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af3562f6997bcf619f284e7a03b283589ef41e2706319fe38a04dcb704a0896 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..019fa194d58272928b1a8d6fca24ddcca947a2a9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19a34f40da364fa970a4c72ca5245e10b634d701e238507a82c970eea519e4b1 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e3eb242e955d93cc97db42e549280c3bd371e55f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/trainer_state.json @@ -0,0 +1,3847 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 5382, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + }, + { + "epoch": 3.0100334448160537, + "grad_norm": 0.4742245078086853, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2700 + }, + { + "epoch": 3.021181716833891, + "grad_norm": 0.5157448649406433, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 2710 + }, + { + "epoch": 3.032329988851728, + "grad_norm": 0.5634726285934448, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 2720 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 0.4554799199104309, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2730 + }, + { + "epoch": 3.0546265328874025, + "grad_norm": 0.6565208435058594, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 2740 + }, + { + "epoch": 3.0657748049052396, + "grad_norm": 0.6174370050430298, + "learning_rate": 0.0002, + "loss": 1.459, + "step": 2750 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.4987483024597168, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 2760 + }, + { + "epoch": 3.088071348940914, + "grad_norm": 0.5810927152633667, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2770 + }, + { + "epoch": 3.0992196209587513, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0002, + "loss": 1.4936, + "step": 2780 + }, + { + "epoch": 3.1103678929765888, + "grad_norm": 0.5479053854942322, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 2790 + }, + { + "epoch": 3.121516164994426, + "grad_norm": 0.6192978620529175, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2800 + }, + { + "epoch": 3.132664437012263, + "grad_norm": 0.560117781162262, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2810 + }, + { + "epoch": 3.1438127090301005, + "grad_norm": 0.6067224740982056, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 3.1549609810479375, + "grad_norm": 0.611287534236908, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 2830 + }, + { + "epoch": 3.1661092530657746, + "grad_norm": 0.6441587209701538, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 2840 + }, + { + "epoch": 3.177257525083612, + "grad_norm": 0.5955114364624023, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2850 + }, + { + "epoch": 3.1884057971014492, + "grad_norm": 0.5554782748222351, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 2860 + }, + { + "epoch": 3.1995540691192863, + "grad_norm": 0.5411370992660522, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 2870 + }, + { + "epoch": 3.210702341137124, + "grad_norm": 0.6152016520500183, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 2880 + }, + { + "epoch": 3.221850613154961, + "grad_norm": 0.5711581110954285, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2890 + }, + { + "epoch": 3.2329988851727984, + "grad_norm": 0.5399307012557983, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2900 + }, + { + "epoch": 3.2441471571906355, + "grad_norm": 0.60606849193573, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2910 + }, + { + "epoch": 3.2552954292084726, + "grad_norm": 0.5873523950576782, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2920 + }, + { + "epoch": 3.26644370122631, + "grad_norm": 0.6149439215660095, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 2930 + }, + { + "epoch": 3.277591973244147, + "grad_norm": 0.5940659046173096, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 2940 + }, + { + "epoch": 3.2887402452619843, + "grad_norm": 0.6846756339073181, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 2950 + }, + { + "epoch": 3.299888517279822, + "grad_norm": 0.6708254218101501, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2960 + }, + { + "epoch": 3.311036789297659, + "grad_norm": 0.5966503620147705, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 2970 + }, + { + "epoch": 3.322185061315496, + "grad_norm": 0.6328812837600708, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 2980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6082745790481567, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 2990 + }, + { + "epoch": 3.3444816053511706, + "grad_norm": 0.6207539439201355, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 3000 + }, + { + "epoch": 3.3556298773690076, + "grad_norm": 0.5501793026924133, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 3010 + }, + { + "epoch": 3.366778149386845, + "grad_norm": 0.571275532245636, + "learning_rate": 0.0002, + "loss": 1.4428, + "step": 3020 + }, + { + "epoch": 3.3779264214046822, + "grad_norm": 0.7003518342971802, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3030 + }, + { + "epoch": 3.3890746934225193, + "grad_norm": 0.609527587890625, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3040 + }, + { + "epoch": 3.400222965440357, + "grad_norm": 0.5880036354064941, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 3050 + }, + { + "epoch": 3.411371237458194, + "grad_norm": 0.5847334265708923, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 3060 + }, + { + "epoch": 3.4225195094760315, + "grad_norm": 0.5373924970626831, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 3070 + }, + { + "epoch": 3.4336677814938685, + "grad_norm": 0.6074833869934082, + "learning_rate": 0.0002, + "loss": 1.5215, + "step": 3080 + }, + { + "epoch": 3.4448160535117056, + "grad_norm": 0.5118414163589478, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 3090 + }, + { + "epoch": 3.4559643255295427, + "grad_norm": 0.5577956438064575, + "learning_rate": 0.0002, + "loss": 1.5006, + "step": 3100 + }, + { + "epoch": 3.46711259754738, + "grad_norm": 0.5654811859130859, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3110 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 0.6216017603874207, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 3120 + }, + { + "epoch": 3.489409141583055, + "grad_norm": 0.5983642339706421, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 3130 + }, + { + "epoch": 3.500557413600892, + "grad_norm": 0.6635708212852478, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3140 + }, + { + "epoch": 3.511705685618729, + "grad_norm": 0.6254258751869202, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 3150 + }, + { + "epoch": 3.522853957636566, + "grad_norm": 0.6359851360321045, + "learning_rate": 0.0002, + "loss": 1.4934, + "step": 3160 + }, + { + "epoch": 3.5340022296544036, + "grad_norm": 0.5938616394996643, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 3170 + }, + { + "epoch": 3.5451505016722407, + "grad_norm": 0.6360630393028259, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 3180 + }, + { + "epoch": 3.556298773690078, + "grad_norm": 0.6097670197486877, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3190 + }, + { + "epoch": 3.5674470457079153, + "grad_norm": 0.5984025597572327, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3200 + }, + { + "epoch": 3.5785953177257523, + "grad_norm": 0.5463748574256897, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 3210 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 1.0017699003219604, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3220 + }, + { + "epoch": 3.600891861761427, + "grad_norm": 0.6519441604614258, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3230 + }, + { + "epoch": 3.6120401337792645, + "grad_norm": 0.6457271575927734, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 3240 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 0.5898868441581726, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 3250 + }, + { + "epoch": 3.6343366778149386, + "grad_norm": 0.6612270474433899, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3260 + }, + { + "epoch": 3.6454849498327757, + "grad_norm": 0.5102090239524841, + "learning_rate": 0.0002, + "loss": 1.4537, + "step": 3270 + }, + { + "epoch": 3.6566332218506132, + "grad_norm": 0.5357231497764587, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 3280 + }, + { + "epoch": 3.6677814938684503, + "grad_norm": 0.6176130175590515, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3290 + }, + { + "epoch": 3.678929765886288, + "grad_norm": 0.6384354829788208, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3300 + }, + { + "epoch": 3.690078037904125, + "grad_norm": 0.5493269562721252, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3310 + }, + { + "epoch": 3.701226309921962, + "grad_norm": 0.5721797943115234, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 3320 + }, + { + "epoch": 3.712374581939799, + "grad_norm": 0.6667633056640625, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 3330 + }, + { + "epoch": 3.7235228539576366, + "grad_norm": 0.5713372826576233, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 3340 + }, + { + "epoch": 3.7346711259754737, + "grad_norm": 0.5925018191337585, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3350 + }, + { + "epoch": 3.745819397993311, + "grad_norm": 0.5660955905914307, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 3360 + }, + { + "epoch": 3.7569676700111483, + "grad_norm": 0.5470759868621826, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 3370 + }, + { + "epoch": 3.7681159420289854, + "grad_norm": 0.7612935900688171, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3380 + }, + { + "epoch": 3.779264214046823, + "grad_norm": 0.577467679977417, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3390 + }, + { + "epoch": 3.79041248606466, + "grad_norm": 0.6125091910362244, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3400 + }, + { + "epoch": 3.801560758082497, + "grad_norm": 0.590386152267456, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 3410 + }, + { + "epoch": 3.8127090301003346, + "grad_norm": 0.5530361533164978, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 3420 + }, + { + "epoch": 3.8238573021181717, + "grad_norm": 0.5714079737663269, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 3430 + }, + { + "epoch": 3.8350055741360087, + "grad_norm": 0.9061086773872375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 3440 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.6193320751190186, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3450 + }, + { + "epoch": 3.8573021181716833, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3460 + }, + { + "epoch": 3.868450390189521, + "grad_norm": 0.5971192717552185, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 3470 + }, + { + "epoch": 3.879598662207358, + "grad_norm": 0.6110154390335083, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 3480 + }, + { + "epoch": 3.890746934225195, + "grad_norm": 0.6644453406333923, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3490 + }, + { + "epoch": 3.901895206243032, + "grad_norm": 0.6674908399581909, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3500 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 0.5516519546508789, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3510 + }, + { + "epoch": 3.9241917502787067, + "grad_norm": 0.6704319715499878, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3520 + }, + { + "epoch": 3.9353400222965442, + "grad_norm": 0.5820314288139343, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3530 + }, + { + "epoch": 3.9464882943143813, + "grad_norm": 0.6931548714637756, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3540 + }, + { + "epoch": 3.9576365663322184, + "grad_norm": 0.6085171103477478, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 3550 + }, + { + "epoch": 3.9687848383500555, + "grad_norm": 0.5973535776138306, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3560 + }, + { + "epoch": 3.979933110367893, + "grad_norm": 0.49761658906936646, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3570 + }, + { + "epoch": 3.99108138238573, + "grad_norm": 0.6282512545585632, + "learning_rate": 0.0002, + "loss": 1.488, + "step": 3580 + }, + { + "epoch": 4.0, + "eval_loss": 1.8790398836135864, + "eval_runtime": 37.9725, + "eval_samples_per_second": 13.562, + "eval_steps_per_second": 1.712, + "step": 3588 + }, + { + "epoch": 4.002229654403568, + "grad_norm": 0.6402973532676697, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3590 + }, + { + "epoch": 4.013377926421405, + "grad_norm": 0.7791030406951904, + "learning_rate": 0.0002, + "loss": 1.3695, + "step": 3600 + }, + { + "epoch": 4.024526198439242, + "grad_norm": 0.7136624455451965, + "learning_rate": 0.0002, + "loss": 1.3545, + "step": 3610 + }, + { + "epoch": 4.035674470457079, + "grad_norm": 0.7608486413955688, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 3620 + }, + { + "epoch": 4.046822742474917, + "grad_norm": 0.7486591935157776, + "learning_rate": 0.0002, + "loss": 1.3067, + "step": 3630 + }, + { + "epoch": 4.057971014492754, + "grad_norm": 0.7576302289962769, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 3640 + }, + { + "epoch": 4.069119286510591, + "grad_norm": 0.7358254194259644, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3650 + }, + { + "epoch": 4.080267558528428, + "grad_norm": 0.821326494216919, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 3660 + }, + { + "epoch": 4.091415830546265, + "grad_norm": 0.7996482253074646, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 3670 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.8527022004127502, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3680 + }, + { + "epoch": 4.11371237458194, + "grad_norm": 0.7313576340675354, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 3690 + }, + { + "epoch": 4.124860646599777, + "grad_norm": 0.7854588627815247, + "learning_rate": 0.0002, + "loss": 1.3307, + "step": 3700 + }, + { + "epoch": 4.136008918617614, + "grad_norm": 0.6588303446769714, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 3710 + }, + { + "epoch": 4.147157190635451, + "grad_norm": 0.7986254692077637, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3720 + }, + { + "epoch": 4.1583054626532885, + "grad_norm": 0.6864156126976013, + "learning_rate": 0.0002, + "loss": 1.3505, + "step": 3730 + }, + { + "epoch": 4.169453734671126, + "grad_norm": 0.8197885155677795, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 3740 + }, + { + "epoch": 4.1806020066889635, + "grad_norm": 0.7169402837753296, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 3750 + }, + { + "epoch": 4.191750278706801, + "grad_norm": 0.7948839068412781, + "learning_rate": 0.0002, + "loss": 1.4388, + "step": 3760 + }, + { + "epoch": 4.202898550724638, + "grad_norm": 0.6775302290916443, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3770 + }, + { + "epoch": 4.214046822742475, + "grad_norm": 0.8913543820381165, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3780 + }, + { + "epoch": 4.225195094760312, + "grad_norm": 0.8046368360519409, + "learning_rate": 0.0002, + "loss": 1.4251, + "step": 3790 + }, + { + "epoch": 4.236343366778149, + "grad_norm": 0.9359563589096069, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3800 + }, + { + "epoch": 4.247491638795987, + "grad_norm": 0.8012228608131409, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 3810 + }, + { + "epoch": 4.258639910813824, + "grad_norm": 0.8405851125717163, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 3820 + }, + { + "epoch": 4.269788182831661, + "grad_norm": 0.7812899351119995, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 3830 + }, + { + "epoch": 4.280936454849498, + "grad_norm": 0.8192463517189026, + "learning_rate": 0.0002, + "loss": 1.4006, + "step": 3840 + }, + { + "epoch": 4.292084726867335, + "grad_norm": 0.6937220096588135, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 3850 + }, + { + "epoch": 4.303232998885173, + "grad_norm": 0.7245703935623169, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 3860 + }, + { + "epoch": 4.31438127090301, + "grad_norm": 0.7816787362098694, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 3870 + }, + { + "epoch": 4.325529542920847, + "grad_norm": 0.7904975414276123, + "learning_rate": 0.0002, + "loss": 1.4316, + "step": 3880 + }, + { + "epoch": 4.336677814938684, + "grad_norm": 1.0394847393035889, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 3890 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.7044078707695007, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3900 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.8852819204330444, + "learning_rate": 0.0002, + "loss": 1.3185, + "step": 3910 + }, + { + "epoch": 4.3701226309921966, + "grad_norm": 0.7712758779525757, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3920 + }, + { + "epoch": 4.381270903010034, + "grad_norm": 0.7677774429321289, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 3930 + }, + { + "epoch": 4.392419175027871, + "grad_norm": 0.7450921535491943, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 3940 + }, + { + "epoch": 4.403567447045708, + "grad_norm": 0.7802795767784119, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 3950 + }, + { + "epoch": 4.414715719063545, + "grad_norm": 0.8976508378982544, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 3960 + }, + { + "epoch": 4.425863991081382, + "grad_norm": 0.8148922324180603, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3970 + }, + { + "epoch": 4.43701226309922, + "grad_norm": 0.7490504384040833, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 3980 + }, + { + "epoch": 4.448160535117057, + "grad_norm": 0.753652036190033, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 3990 + }, + { + "epoch": 4.459308807134894, + "grad_norm": 0.803986668586731, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4000 + }, + { + "epoch": 4.470457079152731, + "grad_norm": 0.8643081784248352, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 4010 + }, + { + "epoch": 4.481605351170568, + "grad_norm": 0.8298280835151672, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 4020 + }, + { + "epoch": 4.492753623188406, + "grad_norm": 0.705355703830719, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4030 + }, + { + "epoch": 4.503901895206243, + "grad_norm": 0.7845711708068848, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 4040 + }, + { + "epoch": 4.51505016722408, + "grad_norm": 0.8056256175041199, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 4050 + }, + { + "epoch": 4.5261984392419174, + "grad_norm": 0.7080171704292297, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4060 + }, + { + "epoch": 4.5373467112597545, + "grad_norm": 0.778388261795044, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 4070 + }, + { + "epoch": 4.548494983277592, + "grad_norm": 0.7337639927864075, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 4080 + }, + { + "epoch": 4.55964325529543, + "grad_norm": 0.815322756767273, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4090 + }, + { + "epoch": 4.570791527313267, + "grad_norm": 0.8817179203033447, + "learning_rate": 0.0002, + "loss": 1.4509, + "step": 4100 + }, + { + "epoch": 4.581939799331104, + "grad_norm": 0.7526060342788696, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 4110 + }, + { + "epoch": 4.593088071348941, + "grad_norm": 0.920465350151062, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4120 + }, + { + "epoch": 4.604236343366778, + "grad_norm": 0.7509559392929077, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4130 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.799469530582428, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 4140 + }, + { + "epoch": 4.626532887402453, + "grad_norm": 0.8099892735481262, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4150 + }, + { + "epoch": 4.63768115942029, + "grad_norm": 0.7790375351905823, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4160 + }, + { + "epoch": 4.648829431438127, + "grad_norm": 0.8292977809906006, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 4170 + }, + { + "epoch": 4.659977703455964, + "grad_norm": 0.8312386274337769, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 4180 + }, + { + "epoch": 4.671125975473801, + "grad_norm": 0.7348753809928894, + "learning_rate": 0.0002, + "loss": 1.4301, + "step": 4190 + }, + { + "epoch": 4.682274247491639, + "grad_norm": 0.8006551265716553, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 4200 + }, + { + "epoch": 4.693422519509476, + "grad_norm": 0.8477752804756165, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4210 + }, + { + "epoch": 4.704570791527313, + "grad_norm": 0.7056546211242676, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 4220 + }, + { + "epoch": 4.7157190635451505, + "grad_norm": 0.7858873009681702, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 4230 + }, + { + "epoch": 4.7268673355629875, + "grad_norm": 0.6968740224838257, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 4240 + }, + { + "epoch": 4.738015607580825, + "grad_norm": 0.7886689901351929, + "learning_rate": 0.0002, + "loss": 1.3594, + "step": 4250 + }, + { + "epoch": 4.749163879598662, + "grad_norm": 0.8935304880142212, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4260 + }, + { + "epoch": 4.7603121516165, + "grad_norm": 0.8395553231239319, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 4270 + }, + { + "epoch": 4.771460423634337, + "grad_norm": 0.817263126373291, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4280 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 0.7912008166313171, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 4290 + }, + { + "epoch": 4.793756967670011, + "grad_norm": 0.6637866497039795, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4300 + }, + { + "epoch": 4.804905239687848, + "grad_norm": 1.0709338188171387, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4310 + }, + { + "epoch": 4.816053511705686, + "grad_norm": 0.8179698586463928, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 4320 + }, + { + "epoch": 4.827201783723523, + "grad_norm": 0.7952052354812622, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4330 + }, + { + "epoch": 4.83835005574136, + "grad_norm": 0.7235367894172668, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 4340 + }, + { + "epoch": 4.849498327759197, + "grad_norm": 0.8484606742858887, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 4350 + }, + { + "epoch": 4.860646599777034, + "grad_norm": 0.7344942092895508, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 4360 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 0.9718546867370605, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 4370 + }, + { + "epoch": 4.882943143812709, + "grad_norm": 0.8174259066581726, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 4380 + }, + { + "epoch": 4.894091415830546, + "grad_norm": 0.8097165822982788, + "learning_rate": 0.0002, + "loss": 1.3244, + "step": 4390 + }, + { + "epoch": 4.9052396878483835, + "grad_norm": 0.756388783454895, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4400 + }, + { + "epoch": 4.916387959866221, + "grad_norm": 0.8324617743492126, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 4410 + }, + { + "epoch": 4.927536231884058, + "grad_norm": 0.8949803709983826, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 4420 + }, + { + "epoch": 4.938684503901895, + "grad_norm": 0.7663722634315491, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4430 + }, + { + "epoch": 4.949832775919733, + "grad_norm": 0.7727946043014526, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 4440 + }, + { + "epoch": 4.96098104793757, + "grad_norm": 0.6872350573539734, + "learning_rate": 0.0002, + "loss": 1.4351, + "step": 4450 + }, + { + "epoch": 4.972129319955407, + "grad_norm": 0.754357099533081, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 4460 + }, + { + "epoch": 4.983277591973244, + "grad_norm": 0.8068729639053345, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 4470 + }, + { + "epoch": 4.994425863991081, + "grad_norm": 0.8200556635856628, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 4480 + }, + { + "epoch": 5.0, + "eval_loss": 1.9543706178665161, + "eval_runtime": 37.9369, + "eval_samples_per_second": 13.575, + "eval_steps_per_second": 1.713, + "step": 4485 + }, + { + "epoch": 5.005574136008919, + "grad_norm": 0.7499465942382812, + "learning_rate": 0.0002, + "loss": 1.3194, + "step": 4490 + }, + { + "epoch": 5.016722408026756, + "grad_norm": 1.030434489250183, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 4500 + }, + { + "epoch": 5.027870680044593, + "grad_norm": 0.8914631605148315, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 4510 + }, + { + "epoch": 5.03901895206243, + "grad_norm": 0.9902928471565247, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 4520 + }, + { + "epoch": 5.050167224080267, + "grad_norm": 0.8338701128959656, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 4530 + }, + { + "epoch": 5.061315496098104, + "grad_norm": 0.9440169334411621, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 4540 + }, + { + "epoch": 5.072463768115942, + "grad_norm": 0.8755099177360535, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 4550 + }, + { + "epoch": 5.083612040133779, + "grad_norm": 0.9145820140838623, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 4560 + }, + { + "epoch": 5.0947603121516165, + "grad_norm": 1.0068492889404297, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 4570 + }, + { + "epoch": 5.105908584169454, + "grad_norm": 0.9184673428535461, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 4580 + }, + { + "epoch": 5.117056856187291, + "grad_norm": 1.1158655881881714, + "learning_rate": 0.0002, + "loss": 1.2948, + "step": 4590 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.9685078263282776, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 4600 + }, + { + "epoch": 5.139353400222966, + "grad_norm": 1.0389559268951416, + "learning_rate": 0.0002, + "loss": 1.2654, + "step": 4610 + }, + { + "epoch": 5.150501672240803, + "grad_norm": 1.0294485092163086, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4620 + }, + { + "epoch": 5.16164994425864, + "grad_norm": 0.9368783235549927, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 4630 + }, + { + "epoch": 5.172798216276477, + "grad_norm": 0.9724945425987244, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 4640 + }, + { + "epoch": 5.183946488294314, + "grad_norm": 0.876488447189331, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4650 + }, + { + "epoch": 5.195094760312152, + "grad_norm": 0.9106290340423584, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 4660 + }, + { + "epoch": 5.206243032329989, + "grad_norm": 1.0924615859985352, + "learning_rate": 0.0002, + "loss": 1.2896, + "step": 4670 + }, + { + "epoch": 5.217391304347826, + "grad_norm": 1.0379078388214111, + "learning_rate": 0.0002, + "loss": 1.245, + "step": 4680 + }, + { + "epoch": 5.228539576365663, + "grad_norm": 0.9507831931114197, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 4690 + }, + { + "epoch": 5.2396878483835, + "grad_norm": 1.0408620834350586, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4700 + }, + { + "epoch": 5.250836120401337, + "grad_norm": 0.9463635087013245, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 4710 + }, + { + "epoch": 5.261984392419175, + "grad_norm": 0.8919326663017273, + "learning_rate": 0.0002, + "loss": 1.1951, + "step": 4720 + }, + { + "epoch": 5.2731326644370125, + "grad_norm": 1.0364950895309448, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 4730 + }, + { + "epoch": 5.2842809364548495, + "grad_norm": 1.0225472450256348, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 4740 + }, + { + "epoch": 5.295429208472687, + "grad_norm": 0.816410481929779, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4750 + }, + { + "epoch": 5.306577480490524, + "grad_norm": 1.0793992280960083, + "learning_rate": 0.0002, + "loss": 1.3601, + "step": 4760 + }, + { + "epoch": 5.317725752508361, + "grad_norm": 1.0203443765640259, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4770 + }, + { + "epoch": 5.328874024526199, + "grad_norm": 1.0731306076049805, + "learning_rate": 0.0002, + "loss": 1.239, + "step": 4780 + }, + { + "epoch": 5.340022296544036, + "grad_norm": 0.9282820224761963, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4790 + }, + { + "epoch": 5.351170568561873, + "grad_norm": 0.9741092920303345, + "learning_rate": 0.0002, + "loss": 1.2159, + "step": 4800 + }, + { + "epoch": 5.36231884057971, + "grad_norm": 1.0683609247207642, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4810 + }, + { + "epoch": 5.373467112597547, + "grad_norm": 0.9035003781318665, + "learning_rate": 0.0002, + "loss": 1.2316, + "step": 4820 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 1.0590119361877441, + "learning_rate": 0.0002, + "loss": 1.2615, + "step": 4830 + }, + { + "epoch": 5.395763656633222, + "grad_norm": 0.9782686829566956, + "learning_rate": 0.0002, + "loss": 1.2089, + "step": 4840 + }, + { + "epoch": 5.406911928651059, + "grad_norm": 1.036087155342102, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 4850 + }, + { + "epoch": 5.418060200668896, + "grad_norm": 0.9999949932098389, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 4860 + }, + { + "epoch": 5.429208472686733, + "grad_norm": 0.9094445109367371, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 4870 + }, + { + "epoch": 5.44035674470457, + "grad_norm": 0.9079708456993103, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 4880 + }, + { + "epoch": 5.451505016722408, + "grad_norm": 1.0426156520843506, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 4890 + }, + { + "epoch": 5.4626532887402455, + "grad_norm": 1.0110737085342407, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4900 + }, + { + "epoch": 5.4738015607580826, + "grad_norm": 1.0994000434875488, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 4910 + }, + { + "epoch": 5.48494983277592, + "grad_norm": 0.8988325595855713, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 4920 + }, + { + "epoch": 5.496098104793757, + "grad_norm": 1.0705887079238892, + "learning_rate": 0.0002, + "loss": 1.2694, + "step": 4930 + }, + { + "epoch": 5.507246376811594, + "grad_norm": 1.0268803834915161, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4940 + }, + { + "epoch": 5.518394648829432, + "grad_norm": 1.0129153728485107, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 4950 + }, + { + "epoch": 5.529542920847269, + "grad_norm": 1.122117280960083, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 4960 + }, + { + "epoch": 5.540691192865106, + "grad_norm": 1.0318635702133179, + "learning_rate": 0.0002, + "loss": 1.2828, + "step": 4970 + }, + { + "epoch": 5.551839464882943, + "grad_norm": 0.9340117573738098, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4980 + }, + { + "epoch": 5.56298773690078, + "grad_norm": 0.9427006244659424, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 4990 + }, + { + "epoch": 5.574136008918618, + "grad_norm": 1.1786518096923828, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 5000 + }, + { + "epoch": 5.585284280936455, + "grad_norm": 1.045157551765442, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 5010 + }, + { + "epoch": 5.596432552954292, + "grad_norm": 1.0475151538848877, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5020 + }, + { + "epoch": 5.607580824972129, + "grad_norm": 1.040969729423523, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 5030 + }, + { + "epoch": 5.618729096989966, + "grad_norm": 0.9610048532485962, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5040 + }, + { + "epoch": 5.6298773690078034, + "grad_norm": 0.9774818420410156, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 5050 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 0.8715312480926514, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 5060 + }, + { + "epoch": 5.6521739130434785, + "grad_norm": 0.9484505653381348, + "learning_rate": 0.0002, + "loss": 1.3077, + "step": 5070 + }, + { + "epoch": 5.663322185061316, + "grad_norm": 0.8292845487594604, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 5080 + }, + { + "epoch": 5.674470457079153, + "grad_norm": 0.9876886606216431, + "learning_rate": 0.0002, + "loss": 1.2357, + "step": 5090 + }, + { + "epoch": 5.68561872909699, + "grad_norm": 0.9899171590805054, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 5100 + }, + { + "epoch": 5.696767001114827, + "grad_norm": 0.9693286418914795, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 5110 + }, + { + "epoch": 5.707915273132665, + "grad_norm": 0.958905816078186, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 5120 + }, + { + "epoch": 5.719063545150502, + "grad_norm": 0.9924837350845337, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 5130 + }, + { + "epoch": 5.730211817168339, + "grad_norm": 0.9551714062690735, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 5140 + }, + { + "epoch": 5.741360089186176, + "grad_norm": 1.0407027006149292, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5150 + }, + { + "epoch": 5.752508361204013, + "grad_norm": 0.9688791036605835, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5160 + }, + { + "epoch": 5.763656633221851, + "grad_norm": 1.0091899633407593, + "learning_rate": 0.0002, + "loss": 1.1424, + "step": 5170 + }, + { + "epoch": 5.774804905239688, + "grad_norm": 0.9393984079360962, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 5180 + }, + { + "epoch": 5.785953177257525, + "grad_norm": 1.1439075469970703, + "learning_rate": 0.0002, + "loss": 1.2177, + "step": 5190 + }, + { + "epoch": 5.797101449275362, + "grad_norm": 1.0178622007369995, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 5200 + }, + { + "epoch": 5.808249721293199, + "grad_norm": 0.8440285921096802, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5210 + }, + { + "epoch": 5.8193979933110365, + "grad_norm": 0.856838583946228, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 5220 + }, + { + "epoch": 5.8305462653288735, + "grad_norm": 0.8676707148551941, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5230 + }, + { + "epoch": 5.8416945373467115, + "grad_norm": 1.1034743785858154, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 5240 + }, + { + "epoch": 5.852842809364549, + "grad_norm": 0.9631003737449646, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 5250 + }, + { + "epoch": 5.863991081382386, + "grad_norm": 1.0478793382644653, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 5260 + }, + { + "epoch": 5.875139353400223, + "grad_norm": 0.9819806218147278, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5270 + }, + { + "epoch": 5.88628762541806, + "grad_norm": 0.8572421073913574, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 5280 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 0.9328814148902893, + "learning_rate": 0.0002, + "loss": 1.246, + "step": 5290 + }, + { + "epoch": 5.908584169453735, + "grad_norm": 1.000305414199829, + "learning_rate": 0.0002, + "loss": 1.3016, + "step": 5300 + }, + { + "epoch": 5.919732441471572, + "grad_norm": 1.1006377935409546, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 5310 + }, + { + "epoch": 5.930880713489409, + "grad_norm": 0.963198721408844, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5320 + }, + { + "epoch": 5.942028985507246, + "grad_norm": 0.8952236175537109, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 5330 + }, + { + "epoch": 5.953177257525084, + "grad_norm": 1.0945496559143066, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 5340 + }, + { + "epoch": 5.964325529542921, + "grad_norm": 1.0053467750549316, + "learning_rate": 0.0002, + "loss": 1.2768, + "step": 5350 + }, + { + "epoch": 5.975473801560758, + "grad_norm": 1.032088279724121, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 5360 + }, + { + "epoch": 5.986622073578595, + "grad_norm": 1.1068958044052124, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 5370 + }, + { + "epoch": 5.997770345596432, + "grad_norm": 1.0064235925674438, + "learning_rate": 0.0002, + "loss": 1.2468, + "step": 5380 + }, + { + "epoch": 6.0, + "eval_loss": 2.0690135955810547, + "eval_runtime": 38.1748, + "eval_samples_per_second": 13.491, + "eval_steps_per_second": 1.703, + "step": 5382 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.490669092139172e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-5382/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fed4de83309dcadac32f4d4b2af91e948dc52f11 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c459bbd22e6bf66d101198706a8a3dee7c944fd08d3c2e1a9f92dcc7aebf85 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6573bec058dca47754c1053f829e31a16c84a58 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dde7faec48d0dcafcbb15996a6d4ff505dd32eed1d75aa4650f559a48df1257 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd404dae4f1e2be39bf9bf5bab3f94c132de0a8c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110c063bef8a8e5f02bd40935fe43720b50c3aadade545a31e03bfebe4eeb394 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..619cc6026aff7c72bea1ae2f685c5cd14e1b7061 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9b0cc38405db08c46133cae9b0bf98ed32cd9c0d78dcbd04ff63606ca50423 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5a6793d525ae4efa0cdea24d992b9eb7a50d7914 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/trainer_state.json @@ -0,0 +1,4478 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 6279, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + }, + { + "epoch": 3.0100334448160537, + "grad_norm": 0.4742245078086853, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2700 + }, + { + "epoch": 3.021181716833891, + "grad_norm": 0.5157448649406433, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 2710 + }, + { + "epoch": 3.032329988851728, + "grad_norm": 0.5634726285934448, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 2720 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 0.4554799199104309, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2730 + }, + { + "epoch": 3.0546265328874025, + "grad_norm": 0.6565208435058594, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 2740 + }, + { + "epoch": 3.0657748049052396, + "grad_norm": 0.6174370050430298, + "learning_rate": 0.0002, + "loss": 1.459, + "step": 2750 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.4987483024597168, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 2760 + }, + { + "epoch": 3.088071348940914, + "grad_norm": 0.5810927152633667, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2770 + }, + { + "epoch": 3.0992196209587513, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0002, + "loss": 1.4936, + "step": 2780 + }, + { + "epoch": 3.1103678929765888, + "grad_norm": 0.5479053854942322, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 2790 + }, + { + "epoch": 3.121516164994426, + "grad_norm": 0.6192978620529175, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2800 + }, + { + "epoch": 3.132664437012263, + "grad_norm": 0.560117781162262, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2810 + }, + { + "epoch": 3.1438127090301005, + "grad_norm": 0.6067224740982056, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 3.1549609810479375, + "grad_norm": 0.611287534236908, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 2830 + }, + { + "epoch": 3.1661092530657746, + "grad_norm": 0.6441587209701538, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 2840 + }, + { + "epoch": 3.177257525083612, + "grad_norm": 0.5955114364624023, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2850 + }, + { + "epoch": 3.1884057971014492, + "grad_norm": 0.5554782748222351, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 2860 + }, + { + "epoch": 3.1995540691192863, + "grad_norm": 0.5411370992660522, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 2870 + }, + { + "epoch": 3.210702341137124, + "grad_norm": 0.6152016520500183, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 2880 + }, + { + "epoch": 3.221850613154961, + "grad_norm": 0.5711581110954285, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2890 + }, + { + "epoch": 3.2329988851727984, + "grad_norm": 0.5399307012557983, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2900 + }, + { + "epoch": 3.2441471571906355, + "grad_norm": 0.60606849193573, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2910 + }, + { + "epoch": 3.2552954292084726, + "grad_norm": 0.5873523950576782, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2920 + }, + { + "epoch": 3.26644370122631, + "grad_norm": 0.6149439215660095, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 2930 + }, + { + "epoch": 3.277591973244147, + "grad_norm": 0.5940659046173096, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 2940 + }, + { + "epoch": 3.2887402452619843, + "grad_norm": 0.6846756339073181, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 2950 + }, + { + "epoch": 3.299888517279822, + "grad_norm": 0.6708254218101501, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2960 + }, + { + "epoch": 3.311036789297659, + "grad_norm": 0.5966503620147705, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 2970 + }, + { + "epoch": 3.322185061315496, + "grad_norm": 0.6328812837600708, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 2980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6082745790481567, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 2990 + }, + { + "epoch": 3.3444816053511706, + "grad_norm": 0.6207539439201355, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 3000 + }, + { + "epoch": 3.3556298773690076, + "grad_norm": 0.5501793026924133, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 3010 + }, + { + "epoch": 3.366778149386845, + "grad_norm": 0.571275532245636, + "learning_rate": 0.0002, + "loss": 1.4428, + "step": 3020 + }, + { + "epoch": 3.3779264214046822, + "grad_norm": 0.7003518342971802, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3030 + }, + { + "epoch": 3.3890746934225193, + "grad_norm": 0.609527587890625, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3040 + }, + { + "epoch": 3.400222965440357, + "grad_norm": 0.5880036354064941, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 3050 + }, + { + "epoch": 3.411371237458194, + "grad_norm": 0.5847334265708923, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 3060 + }, + { + "epoch": 3.4225195094760315, + "grad_norm": 0.5373924970626831, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 3070 + }, + { + "epoch": 3.4336677814938685, + "grad_norm": 0.6074833869934082, + "learning_rate": 0.0002, + "loss": 1.5215, + "step": 3080 + }, + { + "epoch": 3.4448160535117056, + "grad_norm": 0.5118414163589478, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 3090 + }, + { + "epoch": 3.4559643255295427, + "grad_norm": 0.5577956438064575, + "learning_rate": 0.0002, + "loss": 1.5006, + "step": 3100 + }, + { + "epoch": 3.46711259754738, + "grad_norm": 0.5654811859130859, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3110 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 0.6216017603874207, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 3120 + }, + { + "epoch": 3.489409141583055, + "grad_norm": 0.5983642339706421, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 3130 + }, + { + "epoch": 3.500557413600892, + "grad_norm": 0.6635708212852478, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3140 + }, + { + "epoch": 3.511705685618729, + "grad_norm": 0.6254258751869202, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 3150 + }, + { + "epoch": 3.522853957636566, + "grad_norm": 0.6359851360321045, + "learning_rate": 0.0002, + "loss": 1.4934, + "step": 3160 + }, + { + "epoch": 3.5340022296544036, + "grad_norm": 0.5938616394996643, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 3170 + }, + { + "epoch": 3.5451505016722407, + "grad_norm": 0.6360630393028259, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 3180 + }, + { + "epoch": 3.556298773690078, + "grad_norm": 0.6097670197486877, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3190 + }, + { + "epoch": 3.5674470457079153, + "grad_norm": 0.5984025597572327, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3200 + }, + { + "epoch": 3.5785953177257523, + "grad_norm": 0.5463748574256897, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 3210 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 1.0017699003219604, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3220 + }, + { + "epoch": 3.600891861761427, + "grad_norm": 0.6519441604614258, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3230 + }, + { + "epoch": 3.6120401337792645, + "grad_norm": 0.6457271575927734, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 3240 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 0.5898868441581726, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 3250 + }, + { + "epoch": 3.6343366778149386, + "grad_norm": 0.6612270474433899, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3260 + }, + { + "epoch": 3.6454849498327757, + "grad_norm": 0.5102090239524841, + "learning_rate": 0.0002, + "loss": 1.4537, + "step": 3270 + }, + { + "epoch": 3.6566332218506132, + "grad_norm": 0.5357231497764587, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 3280 + }, + { + "epoch": 3.6677814938684503, + "grad_norm": 0.6176130175590515, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3290 + }, + { + "epoch": 3.678929765886288, + "grad_norm": 0.6384354829788208, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3300 + }, + { + "epoch": 3.690078037904125, + "grad_norm": 0.5493269562721252, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3310 + }, + { + "epoch": 3.701226309921962, + "grad_norm": 0.5721797943115234, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 3320 + }, + { + "epoch": 3.712374581939799, + "grad_norm": 0.6667633056640625, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 3330 + }, + { + "epoch": 3.7235228539576366, + "grad_norm": 0.5713372826576233, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 3340 + }, + { + "epoch": 3.7346711259754737, + "grad_norm": 0.5925018191337585, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3350 + }, + { + "epoch": 3.745819397993311, + "grad_norm": 0.5660955905914307, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 3360 + }, + { + "epoch": 3.7569676700111483, + "grad_norm": 0.5470759868621826, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 3370 + }, + { + "epoch": 3.7681159420289854, + "grad_norm": 0.7612935900688171, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3380 + }, + { + "epoch": 3.779264214046823, + "grad_norm": 0.577467679977417, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3390 + }, + { + "epoch": 3.79041248606466, + "grad_norm": 0.6125091910362244, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3400 + }, + { + "epoch": 3.801560758082497, + "grad_norm": 0.590386152267456, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 3410 + }, + { + "epoch": 3.8127090301003346, + "grad_norm": 0.5530361533164978, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 3420 + }, + { + "epoch": 3.8238573021181717, + "grad_norm": 0.5714079737663269, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 3430 + }, + { + "epoch": 3.8350055741360087, + "grad_norm": 0.9061086773872375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 3440 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.6193320751190186, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3450 + }, + { + "epoch": 3.8573021181716833, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3460 + }, + { + "epoch": 3.868450390189521, + "grad_norm": 0.5971192717552185, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 3470 + }, + { + "epoch": 3.879598662207358, + "grad_norm": 0.6110154390335083, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 3480 + }, + { + "epoch": 3.890746934225195, + "grad_norm": 0.6644453406333923, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3490 + }, + { + "epoch": 3.901895206243032, + "grad_norm": 0.6674908399581909, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3500 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 0.5516519546508789, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3510 + }, + { + "epoch": 3.9241917502787067, + "grad_norm": 0.6704319715499878, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3520 + }, + { + "epoch": 3.9353400222965442, + "grad_norm": 0.5820314288139343, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3530 + }, + { + "epoch": 3.9464882943143813, + "grad_norm": 0.6931548714637756, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3540 + }, + { + "epoch": 3.9576365663322184, + "grad_norm": 0.6085171103477478, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 3550 + }, + { + "epoch": 3.9687848383500555, + "grad_norm": 0.5973535776138306, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3560 + }, + { + "epoch": 3.979933110367893, + "grad_norm": 0.49761658906936646, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3570 + }, + { + "epoch": 3.99108138238573, + "grad_norm": 0.6282512545585632, + "learning_rate": 0.0002, + "loss": 1.488, + "step": 3580 + }, + { + "epoch": 4.0, + "eval_loss": 1.8790398836135864, + "eval_runtime": 37.9725, + "eval_samples_per_second": 13.562, + "eval_steps_per_second": 1.712, + "step": 3588 + }, + { + "epoch": 4.002229654403568, + "grad_norm": 0.6402973532676697, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3590 + }, + { + "epoch": 4.013377926421405, + "grad_norm": 0.7791030406951904, + "learning_rate": 0.0002, + "loss": 1.3695, + "step": 3600 + }, + { + "epoch": 4.024526198439242, + "grad_norm": 0.7136624455451965, + "learning_rate": 0.0002, + "loss": 1.3545, + "step": 3610 + }, + { + "epoch": 4.035674470457079, + "grad_norm": 0.7608486413955688, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 3620 + }, + { + "epoch": 4.046822742474917, + "grad_norm": 0.7486591935157776, + "learning_rate": 0.0002, + "loss": 1.3067, + "step": 3630 + }, + { + "epoch": 4.057971014492754, + "grad_norm": 0.7576302289962769, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 3640 + }, + { + "epoch": 4.069119286510591, + "grad_norm": 0.7358254194259644, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3650 + }, + { + "epoch": 4.080267558528428, + "grad_norm": 0.821326494216919, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 3660 + }, + { + "epoch": 4.091415830546265, + "grad_norm": 0.7996482253074646, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 3670 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.8527022004127502, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3680 + }, + { + "epoch": 4.11371237458194, + "grad_norm": 0.7313576340675354, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 3690 + }, + { + "epoch": 4.124860646599777, + "grad_norm": 0.7854588627815247, + "learning_rate": 0.0002, + "loss": 1.3307, + "step": 3700 + }, + { + "epoch": 4.136008918617614, + "grad_norm": 0.6588303446769714, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 3710 + }, + { + "epoch": 4.147157190635451, + "grad_norm": 0.7986254692077637, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3720 + }, + { + "epoch": 4.1583054626532885, + "grad_norm": 0.6864156126976013, + "learning_rate": 0.0002, + "loss": 1.3505, + "step": 3730 + }, + { + "epoch": 4.169453734671126, + "grad_norm": 0.8197885155677795, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 3740 + }, + { + "epoch": 4.1806020066889635, + "grad_norm": 0.7169402837753296, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 3750 + }, + { + "epoch": 4.191750278706801, + "grad_norm": 0.7948839068412781, + "learning_rate": 0.0002, + "loss": 1.4388, + "step": 3760 + }, + { + "epoch": 4.202898550724638, + "grad_norm": 0.6775302290916443, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3770 + }, + { + "epoch": 4.214046822742475, + "grad_norm": 0.8913543820381165, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3780 + }, + { + "epoch": 4.225195094760312, + "grad_norm": 0.8046368360519409, + "learning_rate": 0.0002, + "loss": 1.4251, + "step": 3790 + }, + { + "epoch": 4.236343366778149, + "grad_norm": 0.9359563589096069, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3800 + }, + { + "epoch": 4.247491638795987, + "grad_norm": 0.8012228608131409, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 3810 + }, + { + "epoch": 4.258639910813824, + "grad_norm": 0.8405851125717163, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 3820 + }, + { + "epoch": 4.269788182831661, + "grad_norm": 0.7812899351119995, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 3830 + }, + { + "epoch": 4.280936454849498, + "grad_norm": 0.8192463517189026, + "learning_rate": 0.0002, + "loss": 1.4006, + "step": 3840 + }, + { + "epoch": 4.292084726867335, + "grad_norm": 0.6937220096588135, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 3850 + }, + { + "epoch": 4.303232998885173, + "grad_norm": 0.7245703935623169, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 3860 + }, + { + "epoch": 4.31438127090301, + "grad_norm": 0.7816787362098694, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 3870 + }, + { + "epoch": 4.325529542920847, + "grad_norm": 0.7904975414276123, + "learning_rate": 0.0002, + "loss": 1.4316, + "step": 3880 + }, + { + "epoch": 4.336677814938684, + "grad_norm": 1.0394847393035889, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 3890 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.7044078707695007, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3900 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.8852819204330444, + "learning_rate": 0.0002, + "loss": 1.3185, + "step": 3910 + }, + { + "epoch": 4.3701226309921966, + "grad_norm": 0.7712758779525757, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3920 + }, + { + "epoch": 4.381270903010034, + "grad_norm": 0.7677774429321289, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 3930 + }, + { + "epoch": 4.392419175027871, + "grad_norm": 0.7450921535491943, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 3940 + }, + { + "epoch": 4.403567447045708, + "grad_norm": 0.7802795767784119, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 3950 + }, + { + "epoch": 4.414715719063545, + "grad_norm": 0.8976508378982544, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 3960 + }, + { + "epoch": 4.425863991081382, + "grad_norm": 0.8148922324180603, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3970 + }, + { + "epoch": 4.43701226309922, + "grad_norm": 0.7490504384040833, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 3980 + }, + { + "epoch": 4.448160535117057, + "grad_norm": 0.753652036190033, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 3990 + }, + { + "epoch": 4.459308807134894, + "grad_norm": 0.803986668586731, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4000 + }, + { + "epoch": 4.470457079152731, + "grad_norm": 0.8643081784248352, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 4010 + }, + { + "epoch": 4.481605351170568, + "grad_norm": 0.8298280835151672, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 4020 + }, + { + "epoch": 4.492753623188406, + "grad_norm": 0.705355703830719, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4030 + }, + { + "epoch": 4.503901895206243, + "grad_norm": 0.7845711708068848, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 4040 + }, + { + "epoch": 4.51505016722408, + "grad_norm": 0.8056256175041199, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 4050 + }, + { + "epoch": 4.5261984392419174, + "grad_norm": 0.7080171704292297, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4060 + }, + { + "epoch": 4.5373467112597545, + "grad_norm": 0.778388261795044, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 4070 + }, + { + "epoch": 4.548494983277592, + "grad_norm": 0.7337639927864075, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 4080 + }, + { + "epoch": 4.55964325529543, + "grad_norm": 0.815322756767273, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4090 + }, + { + "epoch": 4.570791527313267, + "grad_norm": 0.8817179203033447, + "learning_rate": 0.0002, + "loss": 1.4509, + "step": 4100 + }, + { + "epoch": 4.581939799331104, + "grad_norm": 0.7526060342788696, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 4110 + }, + { + "epoch": 4.593088071348941, + "grad_norm": 0.920465350151062, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4120 + }, + { + "epoch": 4.604236343366778, + "grad_norm": 0.7509559392929077, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4130 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.799469530582428, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 4140 + }, + { + "epoch": 4.626532887402453, + "grad_norm": 0.8099892735481262, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4150 + }, + { + "epoch": 4.63768115942029, + "grad_norm": 0.7790375351905823, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4160 + }, + { + "epoch": 4.648829431438127, + "grad_norm": 0.8292977809906006, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 4170 + }, + { + "epoch": 4.659977703455964, + "grad_norm": 0.8312386274337769, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 4180 + }, + { + "epoch": 4.671125975473801, + "grad_norm": 0.7348753809928894, + "learning_rate": 0.0002, + "loss": 1.4301, + "step": 4190 + }, + { + "epoch": 4.682274247491639, + "grad_norm": 0.8006551265716553, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 4200 + }, + { + "epoch": 4.693422519509476, + "grad_norm": 0.8477752804756165, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4210 + }, + { + "epoch": 4.704570791527313, + "grad_norm": 0.7056546211242676, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 4220 + }, + { + "epoch": 4.7157190635451505, + "grad_norm": 0.7858873009681702, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 4230 + }, + { + "epoch": 4.7268673355629875, + "grad_norm": 0.6968740224838257, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 4240 + }, + { + "epoch": 4.738015607580825, + "grad_norm": 0.7886689901351929, + "learning_rate": 0.0002, + "loss": 1.3594, + "step": 4250 + }, + { + "epoch": 4.749163879598662, + "grad_norm": 0.8935304880142212, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4260 + }, + { + "epoch": 4.7603121516165, + "grad_norm": 0.8395553231239319, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 4270 + }, + { + "epoch": 4.771460423634337, + "grad_norm": 0.817263126373291, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4280 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 0.7912008166313171, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 4290 + }, + { + "epoch": 4.793756967670011, + "grad_norm": 0.6637866497039795, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4300 + }, + { + "epoch": 4.804905239687848, + "grad_norm": 1.0709338188171387, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4310 + }, + { + "epoch": 4.816053511705686, + "grad_norm": 0.8179698586463928, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 4320 + }, + { + "epoch": 4.827201783723523, + "grad_norm": 0.7952052354812622, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4330 + }, + { + "epoch": 4.83835005574136, + "grad_norm": 0.7235367894172668, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 4340 + }, + { + "epoch": 4.849498327759197, + "grad_norm": 0.8484606742858887, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 4350 + }, + { + "epoch": 4.860646599777034, + "grad_norm": 0.7344942092895508, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 4360 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 0.9718546867370605, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 4370 + }, + { + "epoch": 4.882943143812709, + "grad_norm": 0.8174259066581726, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 4380 + }, + { + "epoch": 4.894091415830546, + "grad_norm": 0.8097165822982788, + "learning_rate": 0.0002, + "loss": 1.3244, + "step": 4390 + }, + { + "epoch": 4.9052396878483835, + "grad_norm": 0.756388783454895, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4400 + }, + { + "epoch": 4.916387959866221, + "grad_norm": 0.8324617743492126, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 4410 + }, + { + "epoch": 4.927536231884058, + "grad_norm": 0.8949803709983826, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 4420 + }, + { + "epoch": 4.938684503901895, + "grad_norm": 0.7663722634315491, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4430 + }, + { + "epoch": 4.949832775919733, + "grad_norm": 0.7727946043014526, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 4440 + }, + { + "epoch": 4.96098104793757, + "grad_norm": 0.6872350573539734, + "learning_rate": 0.0002, + "loss": 1.4351, + "step": 4450 + }, + { + "epoch": 4.972129319955407, + "grad_norm": 0.754357099533081, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 4460 + }, + { + "epoch": 4.983277591973244, + "grad_norm": 0.8068729639053345, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 4470 + }, + { + "epoch": 4.994425863991081, + "grad_norm": 0.8200556635856628, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 4480 + }, + { + "epoch": 5.0, + "eval_loss": 1.9543706178665161, + "eval_runtime": 37.9369, + "eval_samples_per_second": 13.575, + "eval_steps_per_second": 1.713, + "step": 4485 + }, + { + "epoch": 5.005574136008919, + "grad_norm": 0.7499465942382812, + "learning_rate": 0.0002, + "loss": 1.3194, + "step": 4490 + }, + { + "epoch": 5.016722408026756, + "grad_norm": 1.030434489250183, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 4500 + }, + { + "epoch": 5.027870680044593, + "grad_norm": 0.8914631605148315, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 4510 + }, + { + "epoch": 5.03901895206243, + "grad_norm": 0.9902928471565247, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 4520 + }, + { + "epoch": 5.050167224080267, + "grad_norm": 0.8338701128959656, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 4530 + }, + { + "epoch": 5.061315496098104, + "grad_norm": 0.9440169334411621, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 4540 + }, + { + "epoch": 5.072463768115942, + "grad_norm": 0.8755099177360535, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 4550 + }, + { + "epoch": 5.083612040133779, + "grad_norm": 0.9145820140838623, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 4560 + }, + { + "epoch": 5.0947603121516165, + "grad_norm": 1.0068492889404297, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 4570 + }, + { + "epoch": 5.105908584169454, + "grad_norm": 0.9184673428535461, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 4580 + }, + { + "epoch": 5.117056856187291, + "grad_norm": 1.1158655881881714, + "learning_rate": 0.0002, + "loss": 1.2948, + "step": 4590 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.9685078263282776, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 4600 + }, + { + "epoch": 5.139353400222966, + "grad_norm": 1.0389559268951416, + "learning_rate": 0.0002, + "loss": 1.2654, + "step": 4610 + }, + { + "epoch": 5.150501672240803, + "grad_norm": 1.0294485092163086, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4620 + }, + { + "epoch": 5.16164994425864, + "grad_norm": 0.9368783235549927, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 4630 + }, + { + "epoch": 5.172798216276477, + "grad_norm": 0.9724945425987244, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 4640 + }, + { + "epoch": 5.183946488294314, + "grad_norm": 0.876488447189331, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4650 + }, + { + "epoch": 5.195094760312152, + "grad_norm": 0.9106290340423584, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 4660 + }, + { + "epoch": 5.206243032329989, + "grad_norm": 1.0924615859985352, + "learning_rate": 0.0002, + "loss": 1.2896, + "step": 4670 + }, + { + "epoch": 5.217391304347826, + "grad_norm": 1.0379078388214111, + "learning_rate": 0.0002, + "loss": 1.245, + "step": 4680 + }, + { + "epoch": 5.228539576365663, + "grad_norm": 0.9507831931114197, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 4690 + }, + { + "epoch": 5.2396878483835, + "grad_norm": 1.0408620834350586, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4700 + }, + { + "epoch": 5.250836120401337, + "grad_norm": 0.9463635087013245, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 4710 + }, + { + "epoch": 5.261984392419175, + "grad_norm": 0.8919326663017273, + "learning_rate": 0.0002, + "loss": 1.1951, + "step": 4720 + }, + { + "epoch": 5.2731326644370125, + "grad_norm": 1.0364950895309448, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 4730 + }, + { + "epoch": 5.2842809364548495, + "grad_norm": 1.0225472450256348, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 4740 + }, + { + "epoch": 5.295429208472687, + "grad_norm": 0.816410481929779, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4750 + }, + { + "epoch": 5.306577480490524, + "grad_norm": 1.0793992280960083, + "learning_rate": 0.0002, + "loss": 1.3601, + "step": 4760 + }, + { + "epoch": 5.317725752508361, + "grad_norm": 1.0203443765640259, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4770 + }, + { + "epoch": 5.328874024526199, + "grad_norm": 1.0731306076049805, + "learning_rate": 0.0002, + "loss": 1.239, + "step": 4780 + }, + { + "epoch": 5.340022296544036, + "grad_norm": 0.9282820224761963, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4790 + }, + { + "epoch": 5.351170568561873, + "grad_norm": 0.9741092920303345, + "learning_rate": 0.0002, + "loss": 1.2159, + "step": 4800 + }, + { + "epoch": 5.36231884057971, + "grad_norm": 1.0683609247207642, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4810 + }, + { + "epoch": 5.373467112597547, + "grad_norm": 0.9035003781318665, + "learning_rate": 0.0002, + "loss": 1.2316, + "step": 4820 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 1.0590119361877441, + "learning_rate": 0.0002, + "loss": 1.2615, + "step": 4830 + }, + { + "epoch": 5.395763656633222, + "grad_norm": 0.9782686829566956, + "learning_rate": 0.0002, + "loss": 1.2089, + "step": 4840 + }, + { + "epoch": 5.406911928651059, + "grad_norm": 1.036087155342102, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 4850 + }, + { + "epoch": 5.418060200668896, + "grad_norm": 0.9999949932098389, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 4860 + }, + { + "epoch": 5.429208472686733, + "grad_norm": 0.9094445109367371, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 4870 + }, + { + "epoch": 5.44035674470457, + "grad_norm": 0.9079708456993103, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 4880 + }, + { + "epoch": 5.451505016722408, + "grad_norm": 1.0426156520843506, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 4890 + }, + { + "epoch": 5.4626532887402455, + "grad_norm": 1.0110737085342407, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4900 + }, + { + "epoch": 5.4738015607580826, + "grad_norm": 1.0994000434875488, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 4910 + }, + { + "epoch": 5.48494983277592, + "grad_norm": 0.8988325595855713, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 4920 + }, + { + "epoch": 5.496098104793757, + "grad_norm": 1.0705887079238892, + "learning_rate": 0.0002, + "loss": 1.2694, + "step": 4930 + }, + { + "epoch": 5.507246376811594, + "grad_norm": 1.0268803834915161, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4940 + }, + { + "epoch": 5.518394648829432, + "grad_norm": 1.0129153728485107, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 4950 + }, + { + "epoch": 5.529542920847269, + "grad_norm": 1.122117280960083, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 4960 + }, + { + "epoch": 5.540691192865106, + "grad_norm": 1.0318635702133179, + "learning_rate": 0.0002, + "loss": 1.2828, + "step": 4970 + }, + { + "epoch": 5.551839464882943, + "grad_norm": 0.9340117573738098, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4980 + }, + { + "epoch": 5.56298773690078, + "grad_norm": 0.9427006244659424, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 4990 + }, + { + "epoch": 5.574136008918618, + "grad_norm": 1.1786518096923828, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 5000 + }, + { + "epoch": 5.585284280936455, + "grad_norm": 1.045157551765442, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 5010 + }, + { + "epoch": 5.596432552954292, + "grad_norm": 1.0475151538848877, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5020 + }, + { + "epoch": 5.607580824972129, + "grad_norm": 1.040969729423523, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 5030 + }, + { + "epoch": 5.618729096989966, + "grad_norm": 0.9610048532485962, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5040 + }, + { + "epoch": 5.6298773690078034, + "grad_norm": 0.9774818420410156, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 5050 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 0.8715312480926514, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 5060 + }, + { + "epoch": 5.6521739130434785, + "grad_norm": 0.9484505653381348, + "learning_rate": 0.0002, + "loss": 1.3077, + "step": 5070 + }, + { + "epoch": 5.663322185061316, + "grad_norm": 0.8292845487594604, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 5080 + }, + { + "epoch": 5.674470457079153, + "grad_norm": 0.9876886606216431, + "learning_rate": 0.0002, + "loss": 1.2357, + "step": 5090 + }, + { + "epoch": 5.68561872909699, + "grad_norm": 0.9899171590805054, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 5100 + }, + { + "epoch": 5.696767001114827, + "grad_norm": 0.9693286418914795, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 5110 + }, + { + "epoch": 5.707915273132665, + "grad_norm": 0.958905816078186, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 5120 + }, + { + "epoch": 5.719063545150502, + "grad_norm": 0.9924837350845337, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 5130 + }, + { + "epoch": 5.730211817168339, + "grad_norm": 0.9551714062690735, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 5140 + }, + { + "epoch": 5.741360089186176, + "grad_norm": 1.0407027006149292, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5150 + }, + { + "epoch": 5.752508361204013, + "grad_norm": 0.9688791036605835, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5160 + }, + { + "epoch": 5.763656633221851, + "grad_norm": 1.0091899633407593, + "learning_rate": 0.0002, + "loss": 1.1424, + "step": 5170 + }, + { + "epoch": 5.774804905239688, + "grad_norm": 0.9393984079360962, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 5180 + }, + { + "epoch": 5.785953177257525, + "grad_norm": 1.1439075469970703, + "learning_rate": 0.0002, + "loss": 1.2177, + "step": 5190 + }, + { + "epoch": 5.797101449275362, + "grad_norm": 1.0178622007369995, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 5200 + }, + { + "epoch": 5.808249721293199, + "grad_norm": 0.8440285921096802, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5210 + }, + { + "epoch": 5.8193979933110365, + "grad_norm": 0.856838583946228, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 5220 + }, + { + "epoch": 5.8305462653288735, + "grad_norm": 0.8676707148551941, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5230 + }, + { + "epoch": 5.8416945373467115, + "grad_norm": 1.1034743785858154, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 5240 + }, + { + "epoch": 5.852842809364549, + "grad_norm": 0.9631003737449646, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 5250 + }, + { + "epoch": 5.863991081382386, + "grad_norm": 1.0478793382644653, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 5260 + }, + { + "epoch": 5.875139353400223, + "grad_norm": 0.9819806218147278, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5270 + }, + { + "epoch": 5.88628762541806, + "grad_norm": 0.8572421073913574, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 5280 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 0.9328814148902893, + "learning_rate": 0.0002, + "loss": 1.246, + "step": 5290 + }, + { + "epoch": 5.908584169453735, + "grad_norm": 1.000305414199829, + "learning_rate": 0.0002, + "loss": 1.3016, + "step": 5300 + }, + { + "epoch": 5.919732441471572, + "grad_norm": 1.1006377935409546, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 5310 + }, + { + "epoch": 5.930880713489409, + "grad_norm": 0.963198721408844, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5320 + }, + { + "epoch": 5.942028985507246, + "grad_norm": 0.8952236175537109, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 5330 + }, + { + "epoch": 5.953177257525084, + "grad_norm": 1.0945496559143066, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 5340 + }, + { + "epoch": 5.964325529542921, + "grad_norm": 1.0053467750549316, + "learning_rate": 0.0002, + "loss": 1.2768, + "step": 5350 + }, + { + "epoch": 5.975473801560758, + "grad_norm": 1.032088279724121, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 5360 + }, + { + "epoch": 5.986622073578595, + "grad_norm": 1.1068958044052124, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 5370 + }, + { + "epoch": 5.997770345596432, + "grad_norm": 1.0064235925674438, + "learning_rate": 0.0002, + "loss": 1.2468, + "step": 5380 + }, + { + "epoch": 6.0, + "eval_loss": 2.0690135955810547, + "eval_runtime": 38.1748, + "eval_samples_per_second": 13.491, + "eval_steps_per_second": 1.703, + "step": 5382 + }, + { + "epoch": 6.0089186176142695, + "grad_norm": 0.9700132608413696, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 5390 + }, + { + "epoch": 6.0200668896321075, + "grad_norm": 1.159369707107544, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 5400 + }, + { + "epoch": 6.0312151616499445, + "grad_norm": 1.332871913909912, + "learning_rate": 0.0002, + "loss": 1.0646, + "step": 5410 + }, + { + "epoch": 6.042363433667782, + "grad_norm": 1.2239890098571777, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 5420 + }, + { + "epoch": 6.053511705685619, + "grad_norm": 1.5238478183746338, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 5430 + }, + { + "epoch": 6.064659977703456, + "grad_norm": 1.24699068069458, + "learning_rate": 0.0002, + "loss": 1.1423, + "step": 5440 + }, + { + "epoch": 6.075808249721293, + "grad_norm": 1.0891860723495483, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 5450 + }, + { + "epoch": 6.086956521739131, + "grad_norm": 1.2695465087890625, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 5460 + }, + { + "epoch": 6.098104793756968, + "grad_norm": 1.0630067586898804, + "learning_rate": 0.0002, + "loss": 1.0728, + "step": 5470 + }, + { + "epoch": 6.109253065774805, + "grad_norm": 0.9666808247566223, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 5480 + }, + { + "epoch": 6.120401337792642, + "grad_norm": 0.8925976157188416, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5490 + }, + { + "epoch": 6.131549609810479, + "grad_norm": 1.0824475288391113, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 5500 + }, + { + "epoch": 6.142697881828316, + "grad_norm": 1.2315316200256348, + "learning_rate": 0.0002, + "loss": 1.1568, + "step": 5510 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 1.2484779357910156, + "learning_rate": 0.0002, + "loss": 1.0896, + "step": 5520 + }, + { + "epoch": 6.164994425863991, + "grad_norm": 1.2468485832214355, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 5530 + }, + { + "epoch": 6.176142697881828, + "grad_norm": 1.0837156772613525, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 5540 + }, + { + "epoch": 6.187290969899665, + "grad_norm": 1.1650336980819702, + "learning_rate": 0.0002, + "loss": 1.1042, + "step": 5550 + }, + { + "epoch": 6.1984392419175025, + "grad_norm": 1.2004241943359375, + "learning_rate": 0.0002, + "loss": 1.0495, + "step": 5560 + }, + { + "epoch": 6.20958751393534, + "grad_norm": 1.0223793983459473, + "learning_rate": 0.0002, + "loss": 1.023, + "step": 5570 + }, + { + "epoch": 6.2207357859531776, + "grad_norm": 1.4045847654342651, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 5580 + }, + { + "epoch": 6.231884057971015, + "grad_norm": 1.3042256832122803, + "learning_rate": 0.0002, + "loss": 1.1168, + "step": 5590 + }, + { + "epoch": 6.243032329988852, + "grad_norm": 1.1762887239456177, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 5600 + }, + { + "epoch": 6.254180602006689, + "grad_norm": 1.1739851236343384, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 5610 + }, + { + "epoch": 6.265328874024526, + "grad_norm": 1.2904260158538818, + "learning_rate": 0.0002, + "loss": 1.1004, + "step": 5620 + }, + { + "epoch": 6.276477146042364, + "grad_norm": 1.3218393325805664, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 5630 + }, + { + "epoch": 6.287625418060201, + "grad_norm": 1.241175889968872, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 5640 + }, + { + "epoch": 6.298773690078038, + "grad_norm": 1.2916349172592163, + "learning_rate": 0.0002, + "loss": 1.128, + "step": 5650 + }, + { + "epoch": 6.309921962095875, + "grad_norm": 1.5129448175430298, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 5660 + }, + { + "epoch": 6.321070234113712, + "grad_norm": 1.0297393798828125, + "learning_rate": 0.0002, + "loss": 1.0723, + "step": 5670 + }, + { + "epoch": 6.332218506131549, + "grad_norm": 1.1127521991729736, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5680 + }, + { + "epoch": 6.343366778149387, + "grad_norm": 1.0972518920898438, + "learning_rate": 0.0002, + "loss": 1.0305, + "step": 5690 + }, + { + "epoch": 6.354515050167224, + "grad_norm": 1.4237337112426758, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 5700 + }, + { + "epoch": 6.365663322185061, + "grad_norm": 1.121502161026001, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5710 + }, + { + "epoch": 6.3768115942028984, + "grad_norm": 1.1007202863693237, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 5720 + }, + { + "epoch": 6.3879598662207355, + "grad_norm": 1.1609363555908203, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 5730 + }, + { + "epoch": 6.399108138238573, + "grad_norm": 1.3008915185928345, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 5740 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 1.184460163116455, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5750 + }, + { + "epoch": 6.421404682274248, + "grad_norm": 1.2092398405075073, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5760 + }, + { + "epoch": 6.432552954292085, + "grad_norm": 1.2273279428482056, + "learning_rate": 0.0002, + "loss": 1.093, + "step": 5770 + }, + { + "epoch": 6.443701226309922, + "grad_norm": 1.0721677541732788, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 5780 + }, + { + "epoch": 6.454849498327759, + "grad_norm": 1.1679279804229736, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 5790 + }, + { + "epoch": 6.465997770345597, + "grad_norm": 1.3658736944198608, + "learning_rate": 0.0002, + "loss": 1.0795, + "step": 5800 + }, + { + "epoch": 6.477146042363434, + "grad_norm": 1.2440944910049438, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 5810 + }, + { + "epoch": 6.488294314381271, + "grad_norm": 1.1838182210922241, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 5820 + }, + { + "epoch": 6.499442586399108, + "grad_norm": 1.1993956565856934, + "learning_rate": 0.0002, + "loss": 1.0543, + "step": 5830 + }, + { + "epoch": 6.510590858416945, + "grad_norm": 1.1028285026550293, + "learning_rate": 0.0002, + "loss": 1.1587, + "step": 5840 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 1.2117441892623901, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 5850 + }, + { + "epoch": 6.53288740245262, + "grad_norm": 1.2012946605682373, + "learning_rate": 0.0002, + "loss": 1.1237, + "step": 5860 + }, + { + "epoch": 6.544035674470457, + "grad_norm": 1.2491029500961304, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 5870 + }, + { + "epoch": 6.555183946488294, + "grad_norm": 1.4130326509475708, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 5880 + }, + { + "epoch": 6.5663322185061315, + "grad_norm": 1.2596930265426636, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 5890 + }, + { + "epoch": 6.5774804905239685, + "grad_norm": 1.32266104221344, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 5900 + }, + { + "epoch": 6.588628762541806, + "grad_norm": 1.3093374967575073, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 5910 + }, + { + "epoch": 6.599777034559644, + "grad_norm": 1.0436453819274902, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 5920 + }, + { + "epoch": 6.610925306577481, + "grad_norm": 1.064468502998352, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 5930 + }, + { + "epoch": 6.622073578595318, + "grad_norm": 1.2561777830123901, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 5940 + }, + { + "epoch": 6.633221850613155, + "grad_norm": 1.2759621143341064, + "learning_rate": 0.0002, + "loss": 1.1088, + "step": 5950 + }, + { + "epoch": 6.644370122630992, + "grad_norm": 1.0602868795394897, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 5960 + }, + { + "epoch": 6.65551839464883, + "grad_norm": 1.2336751222610474, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 5970 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.1773011684417725, + "learning_rate": 0.0002, + "loss": 1.1264, + "step": 5980 + }, + { + "epoch": 6.677814938684504, + "grad_norm": 1.0779681205749512, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 5990 + }, + { + "epoch": 6.688963210702341, + "grad_norm": 1.396223783493042, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6000 + }, + { + "epoch": 6.700111482720178, + "grad_norm": 1.2238768339157104, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6010 + }, + { + "epoch": 6.711259754738015, + "grad_norm": 1.1152666807174683, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 6020 + }, + { + "epoch": 6.722408026755852, + "grad_norm": 1.2376031875610352, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 6030 + }, + { + "epoch": 6.73355629877369, + "grad_norm": 1.0868488550186157, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 6040 + }, + { + "epoch": 6.744704570791527, + "grad_norm": 1.265913724899292, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 6050 + }, + { + "epoch": 6.7558528428093645, + "grad_norm": 1.1551072597503662, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 6060 + }, + { + "epoch": 6.767001114827202, + "grad_norm": 1.0813109874725342, + "learning_rate": 0.0002, + "loss": 1.1395, + "step": 6070 + }, + { + "epoch": 6.778149386845039, + "grad_norm": 1.2367933988571167, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 6080 + }, + { + "epoch": 6.789297658862877, + "grad_norm": 1.1612437963485718, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 6090 + }, + { + "epoch": 6.800445930880714, + "grad_norm": 1.2715837955474854, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 6100 + }, + { + "epoch": 6.811594202898551, + "grad_norm": 1.1385036706924438, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 6110 + }, + { + "epoch": 6.822742474916388, + "grad_norm": 1.4322341680526733, + "learning_rate": 0.0002, + "loss": 1.137, + "step": 6120 + }, + { + "epoch": 6.833890746934225, + "grad_norm": 1.2975877523422241, + "learning_rate": 0.0002, + "loss": 1.1571, + "step": 6130 + }, + { + "epoch": 6.845039018952063, + "grad_norm": 1.0241044759750366, + "learning_rate": 0.0002, + "loss": 1.1592, + "step": 6140 + }, + { + "epoch": 6.8561872909699, + "grad_norm": 1.352594017982483, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6150 + }, + { + "epoch": 6.867335562987737, + "grad_norm": 1.1166167259216309, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 6160 + }, + { + "epoch": 6.878483835005574, + "grad_norm": 1.1596941947937012, + "learning_rate": 0.0002, + "loss": 1.1409, + "step": 6170 + }, + { + "epoch": 6.889632107023411, + "grad_norm": 1.5753912925720215, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 6180 + }, + { + "epoch": 6.900780379041248, + "grad_norm": 1.1857494115829468, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 6190 + }, + { + "epoch": 6.911928651059085, + "grad_norm": 1.1507896184921265, + "learning_rate": 0.0002, + "loss": 1.137, + "step": 6200 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 1.5194647312164307, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 6210 + }, + { + "epoch": 6.93422519509476, + "grad_norm": 1.1627732515335083, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 6220 + }, + { + "epoch": 6.9453734671125975, + "grad_norm": 1.1929609775543213, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6230 + }, + { + "epoch": 6.956521739130435, + "grad_norm": 1.2704664468765259, + "learning_rate": 0.0002, + "loss": 1.1331, + "step": 6240 + }, + { + "epoch": 6.967670011148272, + "grad_norm": 1.1791198253631592, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 6250 + }, + { + "epoch": 6.97881828316611, + "grad_norm": 1.1948790550231934, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 6260 + }, + { + "epoch": 6.989966555183947, + "grad_norm": 1.222116231918335, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 6270 + }, + { + "epoch": 7.0, + "eval_loss": 2.174532890319824, + "eval_runtime": 38.0962, + "eval_samples_per_second": 13.518, + "eval_steps_per_second": 1.706, + "step": 6279 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.9057806074957005e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-6279/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49b05f787b7b46c450bc8a0cf591d6065f095b5c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb672f96be3842bcdb92b4e031e583160b23f308fc4a899638c88b24af16eee +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c79f0801fa9a52428400cda152d7a324b0b6e6a4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd847fc62fd0607603cd4c6c69422580fec044ef8830b257a8cf9c1a6d82648 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c0fafbdc7686213e9c56e665fef1f891312ae76c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b7620037378ff4978f83f0fc9f166d4d5601e72654c178a90f84506818b9af8 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a73cede2af78c341babf360baffbc26902e9eaeb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83e5cad7b02276591343cfceae1997dd6a68c40dbaedb4b43fdb6414163f62c5 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7b0ee5dc84802ba4569ac40f1504bc681230080 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/trainer_state.json @@ -0,0 +1,5116 @@ +{ + "best_metric": 1.8116765022277832, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 7176, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + }, + { + "epoch": 1.0033444816053512, + "grad_norm": 0.29757317900657654, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 900 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.32168492674827576, + "learning_rate": 0.0002, + "loss": 1.7376, + "step": 910 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.3430717885494232, + "learning_rate": 0.0002, + "loss": 1.6785, + "step": 920 + }, + { + "epoch": 1.0367892976588629, + "grad_norm": 0.3431745767593384, + "learning_rate": 0.0002, + "loss": 1.7356, + "step": 930 + }, + { + "epoch": 1.0479375696767002, + "grad_norm": 0.39787548780441284, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 940 + }, + { + "epoch": 1.0590858416945372, + "grad_norm": 0.3540935218334198, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 950 + }, + { + "epoch": 1.0702341137123745, + "grad_norm": 0.368484765291214, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 960 + }, + { + "epoch": 1.0813823857302118, + "grad_norm": 0.41324466466903687, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 970 + }, + { + "epoch": 1.0925306577480491, + "grad_norm": 0.3696419596672058, + "learning_rate": 0.0002, + "loss": 1.7288, + "step": 980 + }, + { + "epoch": 1.1036789297658862, + "grad_norm": 0.33832886815071106, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 990 + }, + { + "epoch": 1.1148272017837235, + "grad_norm": 0.4411991834640503, + "learning_rate": 0.0002, + "loss": 1.7445, + "step": 1000 + }, + { + "epoch": 1.1259754738015608, + "grad_norm": 0.3935333788394928, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 1010 + }, + { + "epoch": 1.137123745819398, + "grad_norm": 0.32472893595695496, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1020 + }, + { + "epoch": 1.1482720178372352, + "grad_norm": 0.3455545902252197, + "learning_rate": 0.0002, + "loss": 1.6974, + "step": 1030 + }, + { + "epoch": 1.1594202898550725, + "grad_norm": 0.3995654582977295, + "learning_rate": 0.0002, + "loss": 1.7555, + "step": 1040 + }, + { + "epoch": 1.1705685618729098, + "grad_norm": 0.384056031703949, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1050 + }, + { + "epoch": 1.1817168338907469, + "grad_norm": 0.4345705211162567, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1060 + }, + { + "epoch": 1.1928651059085842, + "grad_norm": 0.3524057865142822, + "learning_rate": 0.0002, + "loss": 1.7219, + "step": 1070 + }, + { + "epoch": 1.2040133779264215, + "grad_norm": 0.4047132134437561, + "learning_rate": 0.0002, + "loss": 1.6701, + "step": 1080 + }, + { + "epoch": 1.2151616499442586, + "grad_norm": 0.365824431180954, + "learning_rate": 0.0002, + "loss": 1.7035, + "step": 1090 + }, + { + "epoch": 1.2263099219620959, + "grad_norm": 0.37048354744911194, + "learning_rate": 0.0002, + "loss": 1.7367, + "step": 1100 + }, + { + "epoch": 1.2374581939799332, + "grad_norm": 0.3753672242164612, + "learning_rate": 0.0002, + "loss": 1.7503, + "step": 1110 + }, + { + "epoch": 1.2486064659977703, + "grad_norm": 0.37887042760849, + "learning_rate": 0.0002, + "loss": 1.6984, + "step": 1120 + }, + { + "epoch": 1.2597547380156076, + "grad_norm": 0.3896579444408417, + "learning_rate": 0.0002, + "loss": 1.7866, + "step": 1130 + }, + { + "epoch": 1.2709030100334449, + "grad_norm": 0.3725394010543823, + "learning_rate": 0.0002, + "loss": 1.8085, + "step": 1140 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 0.373989999294281, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 1150 + }, + { + "epoch": 1.2931995540691192, + "grad_norm": 0.4412260353565216, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 1160 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.38538658618927, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1170 + }, + { + "epoch": 1.3154960981047936, + "grad_norm": 0.3644104599952698, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1180 + }, + { + "epoch": 1.326644370122631, + "grad_norm": 0.3615347743034363, + "learning_rate": 0.0002, + "loss": 1.6186, + "step": 1190 + }, + { + "epoch": 1.3377926421404682, + "grad_norm": 0.4260489046573639, + "learning_rate": 0.0002, + "loss": 1.7575, + "step": 1200 + }, + { + "epoch": 1.3489409141583055, + "grad_norm": 0.35236871242523193, + "learning_rate": 0.0002, + "loss": 1.762, + "step": 1210 + }, + { + "epoch": 1.3600891861761428, + "grad_norm": 0.45456627011299133, + "learning_rate": 0.0002, + "loss": 1.7207, + "step": 1220 + }, + { + "epoch": 1.37123745819398, + "grad_norm": 0.391541063785553, + "learning_rate": 0.0002, + "loss": 1.7391, + "step": 1230 + }, + { + "epoch": 1.3823857302118172, + "grad_norm": 0.37955328822135925, + "learning_rate": 0.0002, + "loss": 1.7309, + "step": 1240 + }, + { + "epoch": 1.3935340022296545, + "grad_norm": 0.36955225467681885, + "learning_rate": 0.0002, + "loss": 1.7028, + "step": 1250 + }, + { + "epoch": 1.4046822742474916, + "grad_norm": 0.36156216263771057, + "learning_rate": 0.0002, + "loss": 1.7027, + "step": 1260 + }, + { + "epoch": 1.415830546265329, + "grad_norm": 0.4083487391471863, + "learning_rate": 0.0002, + "loss": 1.8091, + "step": 1270 + }, + { + "epoch": 1.4269788182831662, + "grad_norm": 0.420171320438385, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 1280 + }, + { + "epoch": 1.4381270903010033, + "grad_norm": 0.3581725060939789, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1290 + }, + { + "epoch": 1.4492753623188406, + "grad_norm": 0.3657953441143036, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1300 + }, + { + "epoch": 1.4604236343366779, + "grad_norm": 0.3139931857585907, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 1310 + }, + { + "epoch": 1.471571906354515, + "grad_norm": 0.37750574946403503, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 1320 + }, + { + "epoch": 1.4827201783723523, + "grad_norm": 0.37787437438964844, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1330 + }, + { + "epoch": 1.4938684503901896, + "grad_norm": 0.39505279064178467, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 1340 + }, + { + "epoch": 1.5050167224080266, + "grad_norm": 0.39977672696113586, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 1350 + }, + { + "epoch": 1.516164994425864, + "grad_norm": 0.4395383298397064, + "learning_rate": 0.0002, + "loss": 1.7339, + "step": 1360 + }, + { + "epoch": 1.5273132664437012, + "grad_norm": 0.3452998995780945, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 1370 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.39573904871940613, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1380 + }, + { + "epoch": 1.5496098104793758, + "grad_norm": 0.4886358976364136, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 1390 + }, + { + "epoch": 1.560758082497213, + "grad_norm": 0.35525891184806824, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 1400 + }, + { + "epoch": 1.57190635451505, + "grad_norm": 0.3873274028301239, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1410 + }, + { + "epoch": 1.5830546265328875, + "grad_norm": 0.35162487626075745, + "learning_rate": 0.0002, + "loss": 1.7545, + "step": 1420 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.3533175587654114, + "learning_rate": 0.0002, + "loss": 1.7403, + "step": 1430 + }, + { + "epoch": 1.605351170568562, + "grad_norm": 0.35397887229919434, + "learning_rate": 0.0002, + "loss": 1.7199, + "step": 1440 + }, + { + "epoch": 1.6164994425863992, + "grad_norm": 0.3539091646671295, + "learning_rate": 0.0002, + "loss": 1.701, + "step": 1450 + }, + { + "epoch": 1.6276477146042363, + "grad_norm": 0.38557013869285583, + "learning_rate": 0.0002, + "loss": 1.7407, + "step": 1460 + }, + { + "epoch": 1.6387959866220736, + "grad_norm": 0.3591409921646118, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1470 + }, + { + "epoch": 1.649944258639911, + "grad_norm": 0.3776722848415375, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1480 + }, + { + "epoch": 1.661092530657748, + "grad_norm": 0.3761521875858307, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 1490 + }, + { + "epoch": 1.6722408026755853, + "grad_norm": 0.33939364552497864, + "learning_rate": 0.0002, + "loss": 1.7464, + "step": 1500 + }, + { + "epoch": 1.6833890746934226, + "grad_norm": 0.3961067795753479, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 1510 + }, + { + "epoch": 1.6945373467112597, + "grad_norm": 0.36793094873428345, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1520 + }, + { + "epoch": 1.705685618729097, + "grad_norm": 0.4201025068759918, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 1530 + }, + { + "epoch": 1.7168338907469343, + "grad_norm": 0.382280558347702, + "learning_rate": 0.0002, + "loss": 1.6656, + "step": 1540 + }, + { + "epoch": 1.7279821627647713, + "grad_norm": 0.4504372477531433, + "learning_rate": 0.0002, + "loss": 1.7987, + "step": 1550 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.36121585965156555, + "learning_rate": 0.0002, + "loss": 1.7889, + "step": 1560 + }, + { + "epoch": 1.750278706800446, + "grad_norm": 0.38416755199432373, + "learning_rate": 0.0002, + "loss": 1.7282, + "step": 1570 + }, + { + "epoch": 1.761426978818283, + "grad_norm": 0.3920411467552185, + "learning_rate": 0.0002, + "loss": 1.7759, + "step": 1580 + }, + { + "epoch": 1.7725752508361206, + "grad_norm": 0.4326777756214142, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 1590 + }, + { + "epoch": 1.7837235228539576, + "grad_norm": 0.3582489490509033, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 1600 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 0.36345767974853516, + "learning_rate": 0.0002, + "loss": 1.706, + "step": 1610 + }, + { + "epoch": 1.8060200668896322, + "grad_norm": 0.3951990008354187, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1620 + }, + { + "epoch": 1.8171683389074693, + "grad_norm": 0.35174235701560974, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 1630 + }, + { + "epoch": 1.8283166109253066, + "grad_norm": 0.37005263566970825, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1640 + }, + { + "epoch": 1.839464882943144, + "grad_norm": 0.42875173687934875, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 1650 + }, + { + "epoch": 1.850613154960981, + "grad_norm": 0.3646032512187958, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1660 + }, + { + "epoch": 1.8617614269788183, + "grad_norm": 0.38111618161201477, + "learning_rate": 0.0002, + "loss": 1.6698, + "step": 1670 + }, + { + "epoch": 1.8729096989966556, + "grad_norm": 0.3825555443763733, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1680 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.36418095231056213, + "learning_rate": 0.0002, + "loss": 1.7599, + "step": 1690 + }, + { + "epoch": 1.89520624303233, + "grad_norm": 0.36551007628440857, + "learning_rate": 0.0002, + "loss": 1.6532, + "step": 1700 + }, + { + "epoch": 1.9063545150501673, + "grad_norm": 0.36421480774879456, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1710 + }, + { + "epoch": 1.9175027870680044, + "grad_norm": 0.3791242241859436, + "learning_rate": 0.0002, + "loss": 1.7176, + "step": 1720 + }, + { + "epoch": 1.9286510590858417, + "grad_norm": 0.36655193567276, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1730 + }, + { + "epoch": 1.939799331103679, + "grad_norm": 0.3526945412158966, + "learning_rate": 0.0002, + "loss": 1.7765, + "step": 1740 + }, + { + "epoch": 1.950947603121516, + "grad_norm": 0.41139861941337585, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 1750 + }, + { + "epoch": 1.9620958751393534, + "grad_norm": 0.41757065057754517, + "learning_rate": 0.0002, + "loss": 1.8155, + "step": 1760 + }, + { + "epoch": 1.9732441471571907, + "grad_norm": 0.38956186175346375, + "learning_rate": 0.0002, + "loss": 1.7271, + "step": 1770 + }, + { + "epoch": 1.9843924191750277, + "grad_norm": 0.33891627192497253, + "learning_rate": 0.0002, + "loss": 1.7653, + "step": 1780 + }, + { + "epoch": 1.9955406911928653, + "grad_norm": 0.42879191040992737, + "learning_rate": 0.0002, + "loss": 1.7305, + "step": 1790 + }, + { + "epoch": 2.0, + "eval_loss": 1.8116765022277832, + "eval_runtime": 37.9859, + "eval_samples_per_second": 13.558, + "eval_steps_per_second": 1.711, + "step": 1794 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.42103368043899536, + "learning_rate": 0.0002, + "loss": 1.6724, + "step": 1800 + }, + { + "epoch": 2.0178372352285394, + "grad_norm": 0.41505053639411926, + "learning_rate": 0.0002, + "loss": 1.5812, + "step": 1810 + }, + { + "epoch": 2.028985507246377, + "grad_norm": 0.398190438747406, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 1820 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.4371621310710907, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1830 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.45679208636283875, + "learning_rate": 0.0002, + "loss": 1.6501, + "step": 1840 + }, + { + "epoch": 2.0624303232998886, + "grad_norm": 0.43211811780929565, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 1850 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 0.47492915391921997, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 1860 + }, + { + "epoch": 2.084726867335563, + "grad_norm": 0.41742339730262756, + "learning_rate": 0.0002, + "loss": 1.7169, + "step": 1870 + }, + { + "epoch": 2.0958751393534003, + "grad_norm": 0.45789217948913574, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1880 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.43958935141563416, + "learning_rate": 0.0002, + "loss": 1.6896, + "step": 1890 + }, + { + "epoch": 2.1181716833890745, + "grad_norm": 0.43991968035697937, + "learning_rate": 0.0002, + "loss": 1.6444, + "step": 1900 + }, + { + "epoch": 2.129319955406912, + "grad_norm": 0.4667953848838806, + "learning_rate": 0.0002, + "loss": 1.6057, + "step": 1910 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.42225760221481323, + "learning_rate": 0.0002, + "loss": 1.5999, + "step": 1920 + }, + { + "epoch": 2.1516164994425866, + "grad_norm": 0.418850839138031, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1930 + }, + { + "epoch": 2.1627647714604237, + "grad_norm": 0.43838515877723694, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 1940 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.43798115849494934, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 1950 + }, + { + "epoch": 2.1850613154960983, + "grad_norm": 0.4456610679626465, + "learning_rate": 0.0002, + "loss": 1.632, + "step": 1960 + }, + { + "epoch": 2.1962095875139354, + "grad_norm": 0.4619026482105255, + "learning_rate": 0.0002, + "loss": 1.6338, + "step": 1970 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.4732453525066376, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 1980 + }, + { + "epoch": 2.21850613154961, + "grad_norm": 0.42551836371421814, + "learning_rate": 0.0002, + "loss": 1.581, + "step": 1990 + }, + { + "epoch": 2.229654403567447, + "grad_norm": 0.45154353976249695, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2000 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.4655696451663971, + "learning_rate": 0.0002, + "loss": 1.6768, + "step": 2010 + }, + { + "epoch": 2.2519509476031216, + "grad_norm": 0.5363447666168213, + "learning_rate": 0.0002, + "loss": 1.6972, + "step": 2020 + }, + { + "epoch": 2.2630992196209587, + "grad_norm": 0.4839927852153778, + "learning_rate": 0.0002, + "loss": 1.6561, + "step": 2030 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 0.4639221727848053, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 2040 + }, + { + "epoch": 2.2853957636566333, + "grad_norm": 0.46169278025627136, + "learning_rate": 0.0002, + "loss": 1.6063, + "step": 2050 + }, + { + "epoch": 2.2965440356744704, + "grad_norm": 0.4582304060459137, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 2060 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.48619818687438965, + "learning_rate": 0.0002, + "loss": 1.5778, + "step": 2070 + }, + { + "epoch": 2.318840579710145, + "grad_norm": 0.4382200241088867, + "learning_rate": 0.0002, + "loss": 1.633, + "step": 2080 + }, + { + "epoch": 2.329988851727982, + "grad_norm": 0.4103265106678009, + "learning_rate": 0.0002, + "loss": 1.5854, + "step": 2090 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.5136023759841919, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 2100 + }, + { + "epoch": 2.3522853957636567, + "grad_norm": 0.46723702549934387, + "learning_rate": 0.0002, + "loss": 1.5723, + "step": 2110 + }, + { + "epoch": 2.3634336677814938, + "grad_norm": 0.42269468307495117, + "learning_rate": 0.0002, + "loss": 1.6852, + "step": 2120 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 0.42611163854599, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2130 + }, + { + "epoch": 2.3857302118171684, + "grad_norm": 0.4573901891708374, + "learning_rate": 0.0002, + "loss": 1.5879, + "step": 2140 + }, + { + "epoch": 2.3968784838350055, + "grad_norm": 0.4758673310279846, + "learning_rate": 0.0002, + "loss": 1.6317, + "step": 2150 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.49616846442222595, + "learning_rate": 0.0002, + "loss": 1.6527, + "step": 2160 + }, + { + "epoch": 2.41917502787068, + "grad_norm": 0.5278240442276001, + "learning_rate": 0.0002, + "loss": 1.5796, + "step": 2170 + }, + { + "epoch": 2.430323299888517, + "grad_norm": 0.46806028485298157, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 2180 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 0.44507312774658203, + "learning_rate": 0.0002, + "loss": 1.676, + "step": 2190 + }, + { + "epoch": 2.4526198439241917, + "grad_norm": 0.45716050267219543, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 2200 + }, + { + "epoch": 2.463768115942029, + "grad_norm": 0.4226573705673218, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2210 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.4488418400287628, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2220 + }, + { + "epoch": 2.4860646599777034, + "grad_norm": 0.48324450850486755, + "learning_rate": 0.0002, + "loss": 1.6399, + "step": 2230 + }, + { + "epoch": 2.4972129319955405, + "grad_norm": 0.4866982400417328, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2240 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.4784172773361206, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 2250 + }, + { + "epoch": 2.519509476031215, + "grad_norm": 0.4250621199607849, + "learning_rate": 0.0002, + "loss": 1.6905, + "step": 2260 + }, + { + "epoch": 2.5306577480490526, + "grad_norm": 0.431224524974823, + "learning_rate": 0.0002, + "loss": 1.6582, + "step": 2270 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.3931371867656708, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2280 + }, + { + "epoch": 2.552954292084727, + "grad_norm": 0.4800887703895569, + "learning_rate": 0.0002, + "loss": 1.6897, + "step": 2290 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.4288487136363983, + "learning_rate": 0.0002, + "loss": 1.6205, + "step": 2300 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.48489660024642944, + "learning_rate": 0.0002, + "loss": 1.6005, + "step": 2310 + }, + { + "epoch": 2.5863991081382385, + "grad_norm": 0.4221740961074829, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 2320 + }, + { + "epoch": 2.597547380156076, + "grad_norm": 0.4413852393627167, + "learning_rate": 0.0002, + "loss": 1.666, + "step": 2330 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.4391345679759979, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2340 + }, + { + "epoch": 2.61984392419175, + "grad_norm": 0.4824720323085785, + "learning_rate": 0.0002, + "loss": 1.6942, + "step": 2350 + }, + { + "epoch": 2.6309921962095872, + "grad_norm": 0.4023158550262451, + "learning_rate": 0.0002, + "loss": 1.5615, + "step": 2360 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 0.5107841491699219, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 2370 + }, + { + "epoch": 2.653288740245262, + "grad_norm": 0.4705312252044678, + "learning_rate": 0.0002, + "loss": 1.6258, + "step": 2380 + }, + { + "epoch": 2.6644370122630994, + "grad_norm": 0.4420899450778961, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 2390 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.413308709859848, + "learning_rate": 0.0002, + "loss": 1.6246, + "step": 2400 + }, + { + "epoch": 2.6867335562987735, + "grad_norm": 0.4312658905982971, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 2410 + }, + { + "epoch": 2.697881828316611, + "grad_norm": 0.44714513421058655, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2420 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.49152931571006775, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 2430 + }, + { + "epoch": 2.7201783723522857, + "grad_norm": 0.49458765983581543, + "learning_rate": 0.0002, + "loss": 1.5864, + "step": 2440 + }, + { + "epoch": 2.7313266443701227, + "grad_norm": 0.47838348150253296, + "learning_rate": 0.0002, + "loss": 1.6535, + "step": 2450 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.5781240463256836, + "learning_rate": 0.0002, + "loss": 1.6836, + "step": 2460 + }, + { + "epoch": 2.753623188405797, + "grad_norm": 0.4559851884841919, + "learning_rate": 0.0002, + "loss": 1.6141, + "step": 2470 + }, + { + "epoch": 2.7647714604236344, + "grad_norm": 0.4452647566795349, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 2480 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.43920454382896423, + "learning_rate": 0.0002, + "loss": 1.6209, + "step": 2490 + }, + { + "epoch": 2.787068004459309, + "grad_norm": 0.467780739068985, + "learning_rate": 0.0002, + "loss": 1.5593, + "step": 2500 + }, + { + "epoch": 2.798216276477146, + "grad_norm": 0.4743262529373169, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2510 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.47944432497024536, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2520 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 0.48032790422439575, + "learning_rate": 0.0002, + "loss": 1.6756, + "step": 2530 + }, + { + "epoch": 2.831661092530658, + "grad_norm": 0.45569729804992676, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 2540 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.47940587997436523, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2550 + }, + { + "epoch": 2.8539576365663324, + "grad_norm": 0.5215432047843933, + "learning_rate": 0.0002, + "loss": 1.6286, + "step": 2560 + }, + { + "epoch": 2.8651059085841695, + "grad_norm": 0.4421178102493286, + "learning_rate": 0.0002, + "loss": 1.6718, + "step": 2570 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.45288747549057007, + "learning_rate": 0.0002, + "loss": 1.6201, + "step": 2580 + }, + { + "epoch": 2.887402452619844, + "grad_norm": 0.4472251832485199, + "learning_rate": 0.0002, + "loss": 1.5938, + "step": 2590 + }, + { + "epoch": 2.898550724637681, + "grad_norm": 0.4396503269672394, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 2600 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.48590990900993347, + "learning_rate": 0.0002, + "loss": 1.6503, + "step": 2610 + }, + { + "epoch": 2.9208472686733558, + "grad_norm": 0.4787760376930237, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 2620 + }, + { + "epoch": 2.931995540691193, + "grad_norm": 0.4807611107826233, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 2630 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 0.4625583291053772, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 2.9542920847268674, + "grad_norm": 0.4163573980331421, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2650 + }, + { + "epoch": 2.9654403567447045, + "grad_norm": 0.5142832398414612, + "learning_rate": 0.0002, + "loss": 1.6321, + "step": 2660 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.4459492564201355, + "learning_rate": 0.0002, + "loss": 1.6183, + "step": 2670 + }, + { + "epoch": 2.987736900780379, + "grad_norm": 0.42905503511428833, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2680 + }, + { + "epoch": 2.998885172798216, + "grad_norm": 0.44594648480415344, + "learning_rate": 0.0002, + "loss": 1.6796, + "step": 2690 + }, + { + "epoch": 3.0, + "eval_loss": 1.8300215005874634, + "eval_runtime": 38.0349, + "eval_samples_per_second": 13.54, + "eval_steps_per_second": 1.709, + "step": 2691 + }, + { + "epoch": 3.0100334448160537, + "grad_norm": 0.4742245078086853, + "learning_rate": 0.0002, + "loss": 1.5768, + "step": 2700 + }, + { + "epoch": 3.021181716833891, + "grad_norm": 0.5157448649406433, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 2710 + }, + { + "epoch": 3.032329988851728, + "grad_norm": 0.5634726285934448, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 2720 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 0.4554799199104309, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2730 + }, + { + "epoch": 3.0546265328874025, + "grad_norm": 0.6565208435058594, + "learning_rate": 0.0002, + "loss": 1.4784, + "step": 2740 + }, + { + "epoch": 3.0657748049052396, + "grad_norm": 0.6174370050430298, + "learning_rate": 0.0002, + "loss": 1.459, + "step": 2750 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.4987483024597168, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 2760 + }, + { + "epoch": 3.088071348940914, + "grad_norm": 0.5810927152633667, + "learning_rate": 0.0002, + "loss": 1.5466, + "step": 2770 + }, + { + "epoch": 3.0992196209587513, + "grad_norm": 0.5281634330749512, + "learning_rate": 0.0002, + "loss": 1.4936, + "step": 2780 + }, + { + "epoch": 3.1103678929765888, + "grad_norm": 0.5479053854942322, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 2790 + }, + { + "epoch": 3.121516164994426, + "grad_norm": 0.6192978620529175, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2800 + }, + { + "epoch": 3.132664437012263, + "grad_norm": 0.560117781162262, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2810 + }, + { + "epoch": 3.1438127090301005, + "grad_norm": 0.6067224740982056, + "learning_rate": 0.0002, + "loss": 1.5495, + "step": 2820 + }, + { + "epoch": 3.1549609810479375, + "grad_norm": 0.611287534236908, + "learning_rate": 0.0002, + "loss": 1.5239, + "step": 2830 + }, + { + "epoch": 3.1661092530657746, + "grad_norm": 0.6441587209701538, + "learning_rate": 0.0002, + "loss": 1.4577, + "step": 2840 + }, + { + "epoch": 3.177257525083612, + "grad_norm": 0.5955114364624023, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2850 + }, + { + "epoch": 3.1884057971014492, + "grad_norm": 0.5554782748222351, + "learning_rate": 0.0002, + "loss": 1.5222, + "step": 2860 + }, + { + "epoch": 3.1995540691192863, + "grad_norm": 0.5411370992660522, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 2870 + }, + { + "epoch": 3.210702341137124, + "grad_norm": 0.6152016520500183, + "learning_rate": 0.0002, + "loss": 1.5008, + "step": 2880 + }, + { + "epoch": 3.221850613154961, + "grad_norm": 0.5711581110954285, + "learning_rate": 0.0002, + "loss": 1.5229, + "step": 2890 + }, + { + "epoch": 3.2329988851727984, + "grad_norm": 0.5399307012557983, + "learning_rate": 0.0002, + "loss": 1.5255, + "step": 2900 + }, + { + "epoch": 3.2441471571906355, + "grad_norm": 0.60606849193573, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 2910 + }, + { + "epoch": 3.2552954292084726, + "grad_norm": 0.5873523950576782, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 2920 + }, + { + "epoch": 3.26644370122631, + "grad_norm": 0.6149439215660095, + "learning_rate": 0.0002, + "loss": 1.5208, + "step": 2930 + }, + { + "epoch": 3.277591973244147, + "grad_norm": 0.5940659046173096, + "learning_rate": 0.0002, + "loss": 1.4942, + "step": 2940 + }, + { + "epoch": 3.2887402452619843, + "grad_norm": 0.6846756339073181, + "learning_rate": 0.0002, + "loss": 1.5031, + "step": 2950 + }, + { + "epoch": 3.299888517279822, + "grad_norm": 0.6708254218101501, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2960 + }, + { + "epoch": 3.311036789297659, + "grad_norm": 0.5966503620147705, + "learning_rate": 0.0002, + "loss": 1.5319, + "step": 2970 + }, + { + "epoch": 3.322185061315496, + "grad_norm": 0.6328812837600708, + "learning_rate": 0.0002, + "loss": 1.5173, + "step": 2980 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6082745790481567, + "learning_rate": 0.0002, + "loss": 1.5096, + "step": 2990 + }, + { + "epoch": 3.3444816053511706, + "grad_norm": 0.6207539439201355, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 3000 + }, + { + "epoch": 3.3556298773690076, + "grad_norm": 0.5501793026924133, + "learning_rate": 0.0002, + "loss": 1.5053, + "step": 3010 + }, + { + "epoch": 3.366778149386845, + "grad_norm": 0.571275532245636, + "learning_rate": 0.0002, + "loss": 1.4428, + "step": 3020 + }, + { + "epoch": 3.3779264214046822, + "grad_norm": 0.7003518342971802, + "learning_rate": 0.0002, + "loss": 1.5914, + "step": 3030 + }, + { + "epoch": 3.3890746934225193, + "grad_norm": 0.609527587890625, + "learning_rate": 0.0002, + "loss": 1.5359, + "step": 3040 + }, + { + "epoch": 3.400222965440357, + "grad_norm": 0.5880036354064941, + "learning_rate": 0.0002, + "loss": 1.5072, + "step": 3050 + }, + { + "epoch": 3.411371237458194, + "grad_norm": 0.5847334265708923, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 3060 + }, + { + "epoch": 3.4225195094760315, + "grad_norm": 0.5373924970626831, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 3070 + }, + { + "epoch": 3.4336677814938685, + "grad_norm": 0.6074833869934082, + "learning_rate": 0.0002, + "loss": 1.5215, + "step": 3080 + }, + { + "epoch": 3.4448160535117056, + "grad_norm": 0.5118414163589478, + "learning_rate": 0.0002, + "loss": 1.458, + "step": 3090 + }, + { + "epoch": 3.4559643255295427, + "grad_norm": 0.5577956438064575, + "learning_rate": 0.0002, + "loss": 1.5006, + "step": 3100 + }, + { + "epoch": 3.46711259754738, + "grad_norm": 0.5654811859130859, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3110 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 0.6216017603874207, + "learning_rate": 0.0002, + "loss": 1.523, + "step": 3120 + }, + { + "epoch": 3.489409141583055, + "grad_norm": 0.5983642339706421, + "learning_rate": 0.0002, + "loss": 1.5292, + "step": 3130 + }, + { + "epoch": 3.500557413600892, + "grad_norm": 0.6635708212852478, + "learning_rate": 0.0002, + "loss": 1.5568, + "step": 3140 + }, + { + "epoch": 3.511705685618729, + "grad_norm": 0.6254258751869202, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 3150 + }, + { + "epoch": 3.522853957636566, + "grad_norm": 0.6359851360321045, + "learning_rate": 0.0002, + "loss": 1.4934, + "step": 3160 + }, + { + "epoch": 3.5340022296544036, + "grad_norm": 0.5938616394996643, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 3170 + }, + { + "epoch": 3.5451505016722407, + "grad_norm": 0.6360630393028259, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 3180 + }, + { + "epoch": 3.556298773690078, + "grad_norm": 0.6097670197486877, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 3190 + }, + { + "epoch": 3.5674470457079153, + "grad_norm": 0.5984025597572327, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3200 + }, + { + "epoch": 3.5785953177257523, + "grad_norm": 0.5463748574256897, + "learning_rate": 0.0002, + "loss": 1.4741, + "step": 3210 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 1.0017699003219604, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3220 + }, + { + "epoch": 3.600891861761427, + "grad_norm": 0.6519441604614258, + "learning_rate": 0.0002, + "loss": 1.5687, + "step": 3230 + }, + { + "epoch": 3.6120401337792645, + "grad_norm": 0.6457271575927734, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 3240 + }, + { + "epoch": 3.6231884057971016, + "grad_norm": 0.5898868441581726, + "learning_rate": 0.0002, + "loss": 1.5511, + "step": 3250 + }, + { + "epoch": 3.6343366778149386, + "grad_norm": 0.6612270474433899, + "learning_rate": 0.0002, + "loss": 1.5833, + "step": 3260 + }, + { + "epoch": 3.6454849498327757, + "grad_norm": 0.5102090239524841, + "learning_rate": 0.0002, + "loss": 1.4537, + "step": 3270 + }, + { + "epoch": 3.6566332218506132, + "grad_norm": 0.5357231497764587, + "learning_rate": 0.0002, + "loss": 1.4676, + "step": 3280 + }, + { + "epoch": 3.6677814938684503, + "grad_norm": 0.6176130175590515, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3290 + }, + { + "epoch": 3.678929765886288, + "grad_norm": 0.6384354829788208, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 3300 + }, + { + "epoch": 3.690078037904125, + "grad_norm": 0.5493269562721252, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3310 + }, + { + "epoch": 3.701226309921962, + "grad_norm": 0.5721797943115234, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 3320 + }, + { + "epoch": 3.712374581939799, + "grad_norm": 0.6667633056640625, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 3330 + }, + { + "epoch": 3.7235228539576366, + "grad_norm": 0.5713372826576233, + "learning_rate": 0.0002, + "loss": 1.5372, + "step": 3340 + }, + { + "epoch": 3.7346711259754737, + "grad_norm": 0.5925018191337585, + "learning_rate": 0.0002, + "loss": 1.5959, + "step": 3350 + }, + { + "epoch": 3.745819397993311, + "grad_norm": 0.5660955905914307, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 3360 + }, + { + "epoch": 3.7569676700111483, + "grad_norm": 0.5470759868621826, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 3370 + }, + { + "epoch": 3.7681159420289854, + "grad_norm": 0.7612935900688171, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3380 + }, + { + "epoch": 3.779264214046823, + "grad_norm": 0.577467679977417, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3390 + }, + { + "epoch": 3.79041248606466, + "grad_norm": 0.6125091910362244, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3400 + }, + { + "epoch": 3.801560758082497, + "grad_norm": 0.590386152267456, + "learning_rate": 0.0002, + "loss": 1.5463, + "step": 3410 + }, + { + "epoch": 3.8127090301003346, + "grad_norm": 0.5530361533164978, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 3420 + }, + { + "epoch": 3.8238573021181717, + "grad_norm": 0.5714079737663269, + "learning_rate": 0.0002, + "loss": 1.4797, + "step": 3430 + }, + { + "epoch": 3.8350055741360087, + "grad_norm": 0.9061086773872375, + "learning_rate": 0.0002, + "loss": 1.5324, + "step": 3440 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.6193320751190186, + "learning_rate": 0.0002, + "loss": 1.4513, + "step": 3450 + }, + { + "epoch": 3.8573021181716833, + "grad_norm": 0.5831704139709473, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3460 + }, + { + "epoch": 3.868450390189521, + "grad_norm": 0.5971192717552185, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 3470 + }, + { + "epoch": 3.879598662207358, + "grad_norm": 0.6110154390335083, + "learning_rate": 0.0002, + "loss": 1.484, + "step": 3480 + }, + { + "epoch": 3.890746934225195, + "grad_norm": 0.6644453406333923, + "learning_rate": 0.0002, + "loss": 1.5624, + "step": 3490 + }, + { + "epoch": 3.901895206243032, + "grad_norm": 0.6674908399581909, + "learning_rate": 0.0002, + "loss": 1.5422, + "step": 3500 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 0.5516519546508789, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3510 + }, + { + "epoch": 3.9241917502787067, + "grad_norm": 0.6704319715499878, + "learning_rate": 0.0002, + "loss": 1.5964, + "step": 3520 + }, + { + "epoch": 3.9353400222965442, + "grad_norm": 0.5820314288139343, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 3530 + }, + { + "epoch": 3.9464882943143813, + "grad_norm": 0.6931548714637756, + "learning_rate": 0.0002, + "loss": 1.6458, + "step": 3540 + }, + { + "epoch": 3.9576365663322184, + "grad_norm": 0.6085171103477478, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 3550 + }, + { + "epoch": 3.9687848383500555, + "grad_norm": 0.5973535776138306, + "learning_rate": 0.0002, + "loss": 1.5537, + "step": 3560 + }, + { + "epoch": 3.979933110367893, + "grad_norm": 0.49761658906936646, + "learning_rate": 0.0002, + "loss": 1.5435, + "step": 3570 + }, + { + "epoch": 3.99108138238573, + "grad_norm": 0.6282512545585632, + "learning_rate": 0.0002, + "loss": 1.488, + "step": 3580 + }, + { + "epoch": 4.0, + "eval_loss": 1.8790398836135864, + "eval_runtime": 37.9725, + "eval_samples_per_second": 13.562, + "eval_steps_per_second": 1.712, + "step": 3588 + }, + { + "epoch": 4.002229654403568, + "grad_norm": 0.6402973532676697, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3590 + }, + { + "epoch": 4.013377926421405, + "grad_norm": 0.7791030406951904, + "learning_rate": 0.0002, + "loss": 1.3695, + "step": 3600 + }, + { + "epoch": 4.024526198439242, + "grad_norm": 0.7136624455451965, + "learning_rate": 0.0002, + "loss": 1.3545, + "step": 3610 + }, + { + "epoch": 4.035674470457079, + "grad_norm": 0.7608486413955688, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 3620 + }, + { + "epoch": 4.046822742474917, + "grad_norm": 0.7486591935157776, + "learning_rate": 0.0002, + "loss": 1.3067, + "step": 3630 + }, + { + "epoch": 4.057971014492754, + "grad_norm": 0.7576302289962769, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 3640 + }, + { + "epoch": 4.069119286510591, + "grad_norm": 0.7358254194259644, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3650 + }, + { + "epoch": 4.080267558528428, + "grad_norm": 0.821326494216919, + "learning_rate": 0.0002, + "loss": 1.3015, + "step": 3660 + }, + { + "epoch": 4.091415830546265, + "grad_norm": 0.7996482253074646, + "learning_rate": 0.0002, + "loss": 1.4186, + "step": 3670 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 0.8527022004127502, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3680 + }, + { + "epoch": 4.11371237458194, + "grad_norm": 0.7313576340675354, + "learning_rate": 0.0002, + "loss": 1.3818, + "step": 3690 + }, + { + "epoch": 4.124860646599777, + "grad_norm": 0.7854588627815247, + "learning_rate": 0.0002, + "loss": 1.3307, + "step": 3700 + }, + { + "epoch": 4.136008918617614, + "grad_norm": 0.6588303446769714, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 3710 + }, + { + "epoch": 4.147157190635451, + "grad_norm": 0.7986254692077637, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3720 + }, + { + "epoch": 4.1583054626532885, + "grad_norm": 0.6864156126976013, + "learning_rate": 0.0002, + "loss": 1.3505, + "step": 3730 + }, + { + "epoch": 4.169453734671126, + "grad_norm": 0.8197885155677795, + "learning_rate": 0.0002, + "loss": 1.2987, + "step": 3740 + }, + { + "epoch": 4.1806020066889635, + "grad_norm": 0.7169402837753296, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 3750 + }, + { + "epoch": 4.191750278706801, + "grad_norm": 0.7948839068412781, + "learning_rate": 0.0002, + "loss": 1.4388, + "step": 3760 + }, + { + "epoch": 4.202898550724638, + "grad_norm": 0.6775302290916443, + "learning_rate": 0.0002, + "loss": 1.4648, + "step": 3770 + }, + { + "epoch": 4.214046822742475, + "grad_norm": 0.8913543820381165, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3780 + }, + { + "epoch": 4.225195094760312, + "grad_norm": 0.8046368360519409, + "learning_rate": 0.0002, + "loss": 1.4251, + "step": 3790 + }, + { + "epoch": 4.236343366778149, + "grad_norm": 0.9359563589096069, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3800 + }, + { + "epoch": 4.247491638795987, + "grad_norm": 0.8012228608131409, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 3810 + }, + { + "epoch": 4.258639910813824, + "grad_norm": 0.8405851125717163, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 3820 + }, + { + "epoch": 4.269788182831661, + "grad_norm": 0.7812899351119995, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 3830 + }, + { + "epoch": 4.280936454849498, + "grad_norm": 0.8192463517189026, + "learning_rate": 0.0002, + "loss": 1.4006, + "step": 3840 + }, + { + "epoch": 4.292084726867335, + "grad_norm": 0.6937220096588135, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 3850 + }, + { + "epoch": 4.303232998885173, + "grad_norm": 0.7245703935623169, + "learning_rate": 0.0002, + "loss": 1.391, + "step": 3860 + }, + { + "epoch": 4.31438127090301, + "grad_norm": 0.7816787362098694, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 3870 + }, + { + "epoch": 4.325529542920847, + "grad_norm": 0.7904975414276123, + "learning_rate": 0.0002, + "loss": 1.4316, + "step": 3880 + }, + { + "epoch": 4.336677814938684, + "grad_norm": 1.0394847393035889, + "learning_rate": 0.0002, + "loss": 1.4722, + "step": 3890 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 0.7044078707695007, + "learning_rate": 0.0002, + "loss": 1.4574, + "step": 3900 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 0.8852819204330444, + "learning_rate": 0.0002, + "loss": 1.3185, + "step": 3910 + }, + { + "epoch": 4.3701226309921966, + "grad_norm": 0.7712758779525757, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3920 + }, + { + "epoch": 4.381270903010034, + "grad_norm": 0.7677774429321289, + "learning_rate": 0.0002, + "loss": 1.3519, + "step": 3930 + }, + { + "epoch": 4.392419175027871, + "grad_norm": 0.7450921535491943, + "learning_rate": 0.0002, + "loss": 1.3693, + "step": 3940 + }, + { + "epoch": 4.403567447045708, + "grad_norm": 0.7802795767784119, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 3950 + }, + { + "epoch": 4.414715719063545, + "grad_norm": 0.8976508378982544, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 3960 + }, + { + "epoch": 4.425863991081382, + "grad_norm": 0.8148922324180603, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3970 + }, + { + "epoch": 4.43701226309922, + "grad_norm": 0.7490504384040833, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 3980 + }, + { + "epoch": 4.448160535117057, + "grad_norm": 0.753652036190033, + "learning_rate": 0.0002, + "loss": 1.393, + "step": 3990 + }, + { + "epoch": 4.459308807134894, + "grad_norm": 0.803986668586731, + "learning_rate": 0.0002, + "loss": 1.3467, + "step": 4000 + }, + { + "epoch": 4.470457079152731, + "grad_norm": 0.8643081784248352, + "learning_rate": 0.0002, + "loss": 1.3872, + "step": 4010 + }, + { + "epoch": 4.481605351170568, + "grad_norm": 0.8298280835151672, + "learning_rate": 0.0002, + "loss": 1.407, + "step": 4020 + }, + { + "epoch": 4.492753623188406, + "grad_norm": 0.705355703830719, + "learning_rate": 0.0002, + "loss": 1.4555, + "step": 4030 + }, + { + "epoch": 4.503901895206243, + "grad_norm": 0.7845711708068848, + "learning_rate": 0.0002, + "loss": 1.3646, + "step": 4040 + }, + { + "epoch": 4.51505016722408, + "grad_norm": 0.8056256175041199, + "learning_rate": 0.0002, + "loss": 1.3913, + "step": 4050 + }, + { + "epoch": 4.5261984392419174, + "grad_norm": 0.7080171704292297, + "learning_rate": 0.0002, + "loss": 1.3716, + "step": 4060 + }, + { + "epoch": 4.5373467112597545, + "grad_norm": 0.778388261795044, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 4070 + }, + { + "epoch": 4.548494983277592, + "grad_norm": 0.7337639927864075, + "learning_rate": 0.0002, + "loss": 1.3921, + "step": 4080 + }, + { + "epoch": 4.55964325529543, + "grad_norm": 0.815322756767273, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4090 + }, + { + "epoch": 4.570791527313267, + "grad_norm": 0.8817179203033447, + "learning_rate": 0.0002, + "loss": 1.4509, + "step": 4100 + }, + { + "epoch": 4.581939799331104, + "grad_norm": 0.7526060342788696, + "learning_rate": 0.0002, + "loss": 1.344, + "step": 4110 + }, + { + "epoch": 4.593088071348941, + "grad_norm": 0.920465350151062, + "learning_rate": 0.0002, + "loss": 1.4027, + "step": 4120 + }, + { + "epoch": 4.604236343366778, + "grad_norm": 0.7509559392929077, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4130 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.799469530582428, + "learning_rate": 0.0002, + "loss": 1.4064, + "step": 4140 + }, + { + "epoch": 4.626532887402453, + "grad_norm": 0.8099892735481262, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4150 + }, + { + "epoch": 4.63768115942029, + "grad_norm": 0.7790375351905823, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4160 + }, + { + "epoch": 4.648829431438127, + "grad_norm": 0.8292977809906006, + "learning_rate": 0.0002, + "loss": 1.4626, + "step": 4170 + }, + { + "epoch": 4.659977703455964, + "grad_norm": 0.8312386274337769, + "learning_rate": 0.0002, + "loss": 1.4505, + "step": 4180 + }, + { + "epoch": 4.671125975473801, + "grad_norm": 0.7348753809928894, + "learning_rate": 0.0002, + "loss": 1.4301, + "step": 4190 + }, + { + "epoch": 4.682274247491639, + "grad_norm": 0.8006551265716553, + "learning_rate": 0.0002, + "loss": 1.4074, + "step": 4200 + }, + { + "epoch": 4.693422519509476, + "grad_norm": 0.8477752804756165, + "learning_rate": 0.0002, + "loss": 1.4349, + "step": 4210 + }, + { + "epoch": 4.704570791527313, + "grad_norm": 0.7056546211242676, + "learning_rate": 0.0002, + "loss": 1.3943, + "step": 4220 + }, + { + "epoch": 4.7157190635451505, + "grad_norm": 0.7858873009681702, + "learning_rate": 0.0002, + "loss": 1.3415, + "step": 4230 + }, + { + "epoch": 4.7268673355629875, + "grad_norm": 0.6968740224838257, + "learning_rate": 0.0002, + "loss": 1.3644, + "step": 4240 + }, + { + "epoch": 4.738015607580825, + "grad_norm": 0.7886689901351929, + "learning_rate": 0.0002, + "loss": 1.3594, + "step": 4250 + }, + { + "epoch": 4.749163879598662, + "grad_norm": 0.8935304880142212, + "learning_rate": 0.0002, + "loss": 1.3783, + "step": 4260 + }, + { + "epoch": 4.7603121516165, + "grad_norm": 0.8395553231239319, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 4270 + }, + { + "epoch": 4.771460423634337, + "grad_norm": 0.817263126373291, + "learning_rate": 0.0002, + "loss": 1.4113, + "step": 4280 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 0.7912008166313171, + "learning_rate": 0.0002, + "loss": 1.4181, + "step": 4290 + }, + { + "epoch": 4.793756967670011, + "grad_norm": 0.6637866497039795, + "learning_rate": 0.0002, + "loss": 1.4369, + "step": 4300 + }, + { + "epoch": 4.804905239687848, + "grad_norm": 1.0709338188171387, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4310 + }, + { + "epoch": 4.816053511705686, + "grad_norm": 0.8179698586463928, + "learning_rate": 0.0002, + "loss": 1.4635, + "step": 4320 + }, + { + "epoch": 4.827201783723523, + "grad_norm": 0.7952052354812622, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 4330 + }, + { + "epoch": 4.83835005574136, + "grad_norm": 0.7235367894172668, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 4340 + }, + { + "epoch": 4.849498327759197, + "grad_norm": 0.8484606742858887, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 4350 + }, + { + "epoch": 4.860646599777034, + "grad_norm": 0.7344942092895508, + "learning_rate": 0.0002, + "loss": 1.3898, + "step": 4360 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 0.9718546867370605, + "learning_rate": 0.0002, + "loss": 1.4519, + "step": 4370 + }, + { + "epoch": 4.882943143812709, + "grad_norm": 0.8174259066581726, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 4380 + }, + { + "epoch": 4.894091415830546, + "grad_norm": 0.8097165822982788, + "learning_rate": 0.0002, + "loss": 1.3244, + "step": 4390 + }, + { + "epoch": 4.9052396878483835, + "grad_norm": 0.756388783454895, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 4400 + }, + { + "epoch": 4.916387959866221, + "grad_norm": 0.8324617743492126, + "learning_rate": 0.0002, + "loss": 1.4129, + "step": 4410 + }, + { + "epoch": 4.927536231884058, + "grad_norm": 0.8949803709983826, + "learning_rate": 0.0002, + "loss": 1.3662, + "step": 4420 + }, + { + "epoch": 4.938684503901895, + "grad_norm": 0.7663722634315491, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 4430 + }, + { + "epoch": 4.949832775919733, + "grad_norm": 0.7727946043014526, + "learning_rate": 0.0002, + "loss": 1.3829, + "step": 4440 + }, + { + "epoch": 4.96098104793757, + "grad_norm": 0.6872350573539734, + "learning_rate": 0.0002, + "loss": 1.4351, + "step": 4450 + }, + { + "epoch": 4.972129319955407, + "grad_norm": 0.754357099533081, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 4460 + }, + { + "epoch": 4.983277591973244, + "grad_norm": 0.8068729639053345, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 4470 + }, + { + "epoch": 4.994425863991081, + "grad_norm": 0.8200556635856628, + "learning_rate": 0.0002, + "loss": 1.3891, + "step": 4480 + }, + { + "epoch": 5.0, + "eval_loss": 1.9543706178665161, + "eval_runtime": 37.9369, + "eval_samples_per_second": 13.575, + "eval_steps_per_second": 1.713, + "step": 4485 + }, + { + "epoch": 5.005574136008919, + "grad_norm": 0.7499465942382812, + "learning_rate": 0.0002, + "loss": 1.3194, + "step": 4490 + }, + { + "epoch": 5.016722408026756, + "grad_norm": 1.030434489250183, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 4500 + }, + { + "epoch": 5.027870680044593, + "grad_norm": 0.8914631605148315, + "learning_rate": 0.0002, + "loss": 1.2408, + "step": 4510 + }, + { + "epoch": 5.03901895206243, + "grad_norm": 0.9902928471565247, + "learning_rate": 0.0002, + "loss": 1.1448, + "step": 4520 + }, + { + "epoch": 5.050167224080267, + "grad_norm": 0.8338701128959656, + "learning_rate": 0.0002, + "loss": 1.2401, + "step": 4530 + }, + { + "epoch": 5.061315496098104, + "grad_norm": 0.9440169334411621, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 4540 + }, + { + "epoch": 5.072463768115942, + "grad_norm": 0.8755099177360535, + "learning_rate": 0.0002, + "loss": 1.2196, + "step": 4550 + }, + { + "epoch": 5.083612040133779, + "grad_norm": 0.9145820140838623, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 4560 + }, + { + "epoch": 5.0947603121516165, + "grad_norm": 1.0068492889404297, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 4570 + }, + { + "epoch": 5.105908584169454, + "grad_norm": 0.9184673428535461, + "learning_rate": 0.0002, + "loss": 1.2192, + "step": 4580 + }, + { + "epoch": 5.117056856187291, + "grad_norm": 1.1158655881881714, + "learning_rate": 0.0002, + "loss": 1.2948, + "step": 4590 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 0.9685078263282776, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 4600 + }, + { + "epoch": 5.139353400222966, + "grad_norm": 1.0389559268951416, + "learning_rate": 0.0002, + "loss": 1.2654, + "step": 4610 + }, + { + "epoch": 5.150501672240803, + "grad_norm": 1.0294485092163086, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4620 + }, + { + "epoch": 5.16164994425864, + "grad_norm": 0.9368783235549927, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 4630 + }, + { + "epoch": 5.172798216276477, + "grad_norm": 0.9724945425987244, + "learning_rate": 0.0002, + "loss": 1.206, + "step": 4640 + }, + { + "epoch": 5.183946488294314, + "grad_norm": 0.876488447189331, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4650 + }, + { + "epoch": 5.195094760312152, + "grad_norm": 0.9106290340423584, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 4660 + }, + { + "epoch": 5.206243032329989, + "grad_norm": 1.0924615859985352, + "learning_rate": 0.0002, + "loss": 1.2896, + "step": 4670 + }, + { + "epoch": 5.217391304347826, + "grad_norm": 1.0379078388214111, + "learning_rate": 0.0002, + "loss": 1.245, + "step": 4680 + }, + { + "epoch": 5.228539576365663, + "grad_norm": 0.9507831931114197, + "learning_rate": 0.0002, + "loss": 1.2155, + "step": 4690 + }, + { + "epoch": 5.2396878483835, + "grad_norm": 1.0408620834350586, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4700 + }, + { + "epoch": 5.250836120401337, + "grad_norm": 0.9463635087013245, + "learning_rate": 0.0002, + "loss": 1.1819, + "step": 4710 + }, + { + "epoch": 5.261984392419175, + "grad_norm": 0.8919326663017273, + "learning_rate": 0.0002, + "loss": 1.1951, + "step": 4720 + }, + { + "epoch": 5.2731326644370125, + "grad_norm": 1.0364950895309448, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 4730 + }, + { + "epoch": 5.2842809364548495, + "grad_norm": 1.0225472450256348, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 4740 + }, + { + "epoch": 5.295429208472687, + "grad_norm": 0.816410481929779, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4750 + }, + { + "epoch": 5.306577480490524, + "grad_norm": 1.0793992280960083, + "learning_rate": 0.0002, + "loss": 1.3601, + "step": 4760 + }, + { + "epoch": 5.317725752508361, + "grad_norm": 1.0203443765640259, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4770 + }, + { + "epoch": 5.328874024526199, + "grad_norm": 1.0731306076049805, + "learning_rate": 0.0002, + "loss": 1.239, + "step": 4780 + }, + { + "epoch": 5.340022296544036, + "grad_norm": 0.9282820224761963, + "learning_rate": 0.0002, + "loss": 1.2893, + "step": 4790 + }, + { + "epoch": 5.351170568561873, + "grad_norm": 0.9741092920303345, + "learning_rate": 0.0002, + "loss": 1.2159, + "step": 4800 + }, + { + "epoch": 5.36231884057971, + "grad_norm": 1.0683609247207642, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4810 + }, + { + "epoch": 5.373467112597547, + "grad_norm": 0.9035003781318665, + "learning_rate": 0.0002, + "loss": 1.2316, + "step": 4820 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 1.0590119361877441, + "learning_rate": 0.0002, + "loss": 1.2615, + "step": 4830 + }, + { + "epoch": 5.395763656633222, + "grad_norm": 0.9782686829566956, + "learning_rate": 0.0002, + "loss": 1.2089, + "step": 4840 + }, + { + "epoch": 5.406911928651059, + "grad_norm": 1.036087155342102, + "learning_rate": 0.0002, + "loss": 1.3019, + "step": 4850 + }, + { + "epoch": 5.418060200668896, + "grad_norm": 0.9999949932098389, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 4860 + }, + { + "epoch": 5.429208472686733, + "grad_norm": 0.9094445109367371, + "learning_rate": 0.0002, + "loss": 1.3014, + "step": 4870 + }, + { + "epoch": 5.44035674470457, + "grad_norm": 0.9079708456993103, + "learning_rate": 0.0002, + "loss": 1.2013, + "step": 4880 + }, + { + "epoch": 5.451505016722408, + "grad_norm": 1.0426156520843506, + "learning_rate": 0.0002, + "loss": 1.2224, + "step": 4890 + }, + { + "epoch": 5.4626532887402455, + "grad_norm": 1.0110737085342407, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4900 + }, + { + "epoch": 5.4738015607580826, + "grad_norm": 1.0994000434875488, + "learning_rate": 0.0002, + "loss": 1.2178, + "step": 4910 + }, + { + "epoch": 5.48494983277592, + "grad_norm": 0.8988325595855713, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 4920 + }, + { + "epoch": 5.496098104793757, + "grad_norm": 1.0705887079238892, + "learning_rate": 0.0002, + "loss": 1.2694, + "step": 4930 + }, + { + "epoch": 5.507246376811594, + "grad_norm": 1.0268803834915161, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4940 + }, + { + "epoch": 5.518394648829432, + "grad_norm": 1.0129153728485107, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 4950 + }, + { + "epoch": 5.529542920847269, + "grad_norm": 1.122117280960083, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 4960 + }, + { + "epoch": 5.540691192865106, + "grad_norm": 1.0318635702133179, + "learning_rate": 0.0002, + "loss": 1.2828, + "step": 4970 + }, + { + "epoch": 5.551839464882943, + "grad_norm": 0.9340117573738098, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 4980 + }, + { + "epoch": 5.56298773690078, + "grad_norm": 0.9427006244659424, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 4990 + }, + { + "epoch": 5.574136008918618, + "grad_norm": 1.1786518096923828, + "learning_rate": 0.0002, + "loss": 1.2911, + "step": 5000 + }, + { + "epoch": 5.585284280936455, + "grad_norm": 1.045157551765442, + "learning_rate": 0.0002, + "loss": 1.2279, + "step": 5010 + }, + { + "epoch": 5.596432552954292, + "grad_norm": 1.0475151538848877, + "learning_rate": 0.0002, + "loss": 1.2269, + "step": 5020 + }, + { + "epoch": 5.607580824972129, + "grad_norm": 1.040969729423523, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 5030 + }, + { + "epoch": 5.618729096989966, + "grad_norm": 0.9610048532485962, + "learning_rate": 0.0002, + "loss": 1.2134, + "step": 5040 + }, + { + "epoch": 5.6298773690078034, + "grad_norm": 0.9774818420410156, + "learning_rate": 0.0002, + "loss": 1.1657, + "step": 5050 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 0.8715312480926514, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 5060 + }, + { + "epoch": 5.6521739130434785, + "grad_norm": 0.9484505653381348, + "learning_rate": 0.0002, + "loss": 1.3077, + "step": 5070 + }, + { + "epoch": 5.663322185061316, + "grad_norm": 0.8292845487594604, + "learning_rate": 0.0002, + "loss": 1.2787, + "step": 5080 + }, + { + "epoch": 5.674470457079153, + "grad_norm": 0.9876886606216431, + "learning_rate": 0.0002, + "loss": 1.2357, + "step": 5090 + }, + { + "epoch": 5.68561872909699, + "grad_norm": 0.9899171590805054, + "learning_rate": 0.0002, + "loss": 1.2864, + "step": 5100 + }, + { + "epoch": 5.696767001114827, + "grad_norm": 0.9693286418914795, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 5110 + }, + { + "epoch": 5.707915273132665, + "grad_norm": 0.958905816078186, + "learning_rate": 0.0002, + "loss": 1.1952, + "step": 5120 + }, + { + "epoch": 5.719063545150502, + "grad_norm": 0.9924837350845337, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 5130 + }, + { + "epoch": 5.730211817168339, + "grad_norm": 0.9551714062690735, + "learning_rate": 0.0002, + "loss": 1.3057, + "step": 5140 + }, + { + "epoch": 5.741360089186176, + "grad_norm": 1.0407027006149292, + "learning_rate": 0.0002, + "loss": 1.2643, + "step": 5150 + }, + { + "epoch": 5.752508361204013, + "grad_norm": 0.9688791036605835, + "learning_rate": 0.0002, + "loss": 1.1833, + "step": 5160 + }, + { + "epoch": 5.763656633221851, + "grad_norm": 1.0091899633407593, + "learning_rate": 0.0002, + "loss": 1.1424, + "step": 5170 + }, + { + "epoch": 5.774804905239688, + "grad_norm": 0.9393984079360962, + "learning_rate": 0.0002, + "loss": 1.2575, + "step": 5180 + }, + { + "epoch": 5.785953177257525, + "grad_norm": 1.1439075469970703, + "learning_rate": 0.0002, + "loss": 1.2177, + "step": 5190 + }, + { + "epoch": 5.797101449275362, + "grad_norm": 1.0178622007369995, + "learning_rate": 0.0002, + "loss": 1.3355, + "step": 5200 + }, + { + "epoch": 5.808249721293199, + "grad_norm": 0.8440285921096802, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5210 + }, + { + "epoch": 5.8193979933110365, + "grad_norm": 0.856838583946228, + "learning_rate": 0.0002, + "loss": 1.3097, + "step": 5220 + }, + { + "epoch": 5.8305462653288735, + "grad_norm": 0.8676707148551941, + "learning_rate": 0.0002, + "loss": 1.3109, + "step": 5230 + }, + { + "epoch": 5.8416945373467115, + "grad_norm": 1.1034743785858154, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 5240 + }, + { + "epoch": 5.852842809364549, + "grad_norm": 0.9631003737449646, + "learning_rate": 0.0002, + "loss": 1.2473, + "step": 5250 + }, + { + "epoch": 5.863991081382386, + "grad_norm": 1.0478793382644653, + "learning_rate": 0.0002, + "loss": 1.2693, + "step": 5260 + }, + { + "epoch": 5.875139353400223, + "grad_norm": 0.9819806218147278, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 5270 + }, + { + "epoch": 5.88628762541806, + "grad_norm": 0.8572421073913574, + "learning_rate": 0.0002, + "loss": 1.2817, + "step": 5280 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 0.9328814148902893, + "learning_rate": 0.0002, + "loss": 1.246, + "step": 5290 + }, + { + "epoch": 5.908584169453735, + "grad_norm": 1.000305414199829, + "learning_rate": 0.0002, + "loss": 1.3016, + "step": 5300 + }, + { + "epoch": 5.919732441471572, + "grad_norm": 1.1006377935409546, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 5310 + }, + { + "epoch": 5.930880713489409, + "grad_norm": 0.963198721408844, + "learning_rate": 0.0002, + "loss": 1.3317, + "step": 5320 + }, + { + "epoch": 5.942028985507246, + "grad_norm": 0.8952236175537109, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 5330 + }, + { + "epoch": 5.953177257525084, + "grad_norm": 1.0945496559143066, + "learning_rate": 0.0002, + "loss": 1.2536, + "step": 5340 + }, + { + "epoch": 5.964325529542921, + "grad_norm": 1.0053467750549316, + "learning_rate": 0.0002, + "loss": 1.2768, + "step": 5350 + }, + { + "epoch": 5.975473801560758, + "grad_norm": 1.032088279724121, + "learning_rate": 0.0002, + "loss": 1.3075, + "step": 5360 + }, + { + "epoch": 5.986622073578595, + "grad_norm": 1.1068958044052124, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 5370 + }, + { + "epoch": 5.997770345596432, + "grad_norm": 1.0064235925674438, + "learning_rate": 0.0002, + "loss": 1.2468, + "step": 5380 + }, + { + "epoch": 6.0, + "eval_loss": 2.0690135955810547, + "eval_runtime": 38.1748, + "eval_samples_per_second": 13.491, + "eval_steps_per_second": 1.703, + "step": 5382 + }, + { + "epoch": 6.0089186176142695, + "grad_norm": 0.9700132608413696, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 5390 + }, + { + "epoch": 6.0200668896321075, + "grad_norm": 1.159369707107544, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 5400 + }, + { + "epoch": 6.0312151616499445, + "grad_norm": 1.332871913909912, + "learning_rate": 0.0002, + "loss": 1.0646, + "step": 5410 + }, + { + "epoch": 6.042363433667782, + "grad_norm": 1.2239890098571777, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 5420 + }, + { + "epoch": 6.053511705685619, + "grad_norm": 1.5238478183746338, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 5430 + }, + { + "epoch": 6.064659977703456, + "grad_norm": 1.24699068069458, + "learning_rate": 0.0002, + "loss": 1.1423, + "step": 5440 + }, + { + "epoch": 6.075808249721293, + "grad_norm": 1.0891860723495483, + "learning_rate": 0.0002, + "loss": 1.0789, + "step": 5450 + }, + { + "epoch": 6.086956521739131, + "grad_norm": 1.2695465087890625, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 5460 + }, + { + "epoch": 6.098104793756968, + "grad_norm": 1.0630067586898804, + "learning_rate": 0.0002, + "loss": 1.0728, + "step": 5470 + }, + { + "epoch": 6.109253065774805, + "grad_norm": 0.9666808247566223, + "learning_rate": 0.0002, + "loss": 1.0391, + "step": 5480 + }, + { + "epoch": 6.120401337792642, + "grad_norm": 0.8925976157188416, + "learning_rate": 0.0002, + "loss": 1.1159, + "step": 5490 + }, + { + "epoch": 6.131549609810479, + "grad_norm": 1.0824475288391113, + "learning_rate": 0.0002, + "loss": 1.0371, + "step": 5500 + }, + { + "epoch": 6.142697881828316, + "grad_norm": 1.2315316200256348, + "learning_rate": 0.0002, + "loss": 1.1568, + "step": 5510 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 1.2484779357910156, + "learning_rate": 0.0002, + "loss": 1.0896, + "step": 5520 + }, + { + "epoch": 6.164994425863991, + "grad_norm": 1.2468485832214355, + "learning_rate": 0.0002, + "loss": 1.0368, + "step": 5530 + }, + { + "epoch": 6.176142697881828, + "grad_norm": 1.0837156772613525, + "learning_rate": 0.0002, + "loss": 1.1368, + "step": 5540 + }, + { + "epoch": 6.187290969899665, + "grad_norm": 1.1650336980819702, + "learning_rate": 0.0002, + "loss": 1.1042, + "step": 5550 + }, + { + "epoch": 6.1984392419175025, + "grad_norm": 1.2004241943359375, + "learning_rate": 0.0002, + "loss": 1.0495, + "step": 5560 + }, + { + "epoch": 6.20958751393534, + "grad_norm": 1.0223793983459473, + "learning_rate": 0.0002, + "loss": 1.023, + "step": 5570 + }, + { + "epoch": 6.2207357859531776, + "grad_norm": 1.4045847654342651, + "learning_rate": 0.0002, + "loss": 1.0837, + "step": 5580 + }, + { + "epoch": 6.231884057971015, + "grad_norm": 1.3042256832122803, + "learning_rate": 0.0002, + "loss": 1.1168, + "step": 5590 + }, + { + "epoch": 6.243032329988852, + "grad_norm": 1.1762887239456177, + "learning_rate": 0.0002, + "loss": 1.0138, + "step": 5600 + }, + { + "epoch": 6.254180602006689, + "grad_norm": 1.1739851236343384, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 5610 + }, + { + "epoch": 6.265328874024526, + "grad_norm": 1.2904260158538818, + "learning_rate": 0.0002, + "loss": 1.1004, + "step": 5620 + }, + { + "epoch": 6.276477146042364, + "grad_norm": 1.3218393325805664, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 5630 + }, + { + "epoch": 6.287625418060201, + "grad_norm": 1.241175889968872, + "learning_rate": 0.0002, + "loss": 1.0876, + "step": 5640 + }, + { + "epoch": 6.298773690078038, + "grad_norm": 1.2916349172592163, + "learning_rate": 0.0002, + "loss": 1.128, + "step": 5650 + }, + { + "epoch": 6.309921962095875, + "grad_norm": 1.5129448175430298, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 5660 + }, + { + "epoch": 6.321070234113712, + "grad_norm": 1.0297393798828125, + "learning_rate": 0.0002, + "loss": 1.0723, + "step": 5670 + }, + { + "epoch": 6.332218506131549, + "grad_norm": 1.1127521991729736, + "learning_rate": 0.0002, + "loss": 1.0513, + "step": 5680 + }, + { + "epoch": 6.343366778149387, + "grad_norm": 1.0972518920898438, + "learning_rate": 0.0002, + "loss": 1.0305, + "step": 5690 + }, + { + "epoch": 6.354515050167224, + "grad_norm": 1.4237337112426758, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 5700 + }, + { + "epoch": 6.365663322185061, + "grad_norm": 1.121502161026001, + "learning_rate": 0.0002, + "loss": 1.0924, + "step": 5710 + }, + { + "epoch": 6.3768115942028984, + "grad_norm": 1.1007202863693237, + "learning_rate": 0.0002, + "loss": 1.0208, + "step": 5720 + }, + { + "epoch": 6.3879598662207355, + "grad_norm": 1.1609363555908203, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 5730 + }, + { + "epoch": 6.399108138238573, + "grad_norm": 1.3008915185928345, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 5740 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 1.184460163116455, + "learning_rate": 0.0002, + "loss": 1.1647, + "step": 5750 + }, + { + "epoch": 6.421404682274248, + "grad_norm": 1.2092398405075073, + "learning_rate": 0.0002, + "loss": 1.109, + "step": 5760 + }, + { + "epoch": 6.432552954292085, + "grad_norm": 1.2273279428482056, + "learning_rate": 0.0002, + "loss": 1.093, + "step": 5770 + }, + { + "epoch": 6.443701226309922, + "grad_norm": 1.0721677541732788, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 5780 + }, + { + "epoch": 6.454849498327759, + "grad_norm": 1.1679279804229736, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 5790 + }, + { + "epoch": 6.465997770345597, + "grad_norm": 1.3658736944198608, + "learning_rate": 0.0002, + "loss": 1.0795, + "step": 5800 + }, + { + "epoch": 6.477146042363434, + "grad_norm": 1.2440944910049438, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 5810 + }, + { + "epoch": 6.488294314381271, + "grad_norm": 1.1838182210922241, + "learning_rate": 0.0002, + "loss": 1.0815, + "step": 5820 + }, + { + "epoch": 6.499442586399108, + "grad_norm": 1.1993956565856934, + "learning_rate": 0.0002, + "loss": 1.0543, + "step": 5830 + }, + { + "epoch": 6.510590858416945, + "grad_norm": 1.1028285026550293, + "learning_rate": 0.0002, + "loss": 1.1587, + "step": 5840 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 1.2117441892623901, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 5850 + }, + { + "epoch": 6.53288740245262, + "grad_norm": 1.2012946605682373, + "learning_rate": 0.0002, + "loss": 1.1237, + "step": 5860 + }, + { + "epoch": 6.544035674470457, + "grad_norm": 1.2491029500961304, + "learning_rate": 0.0002, + "loss": 1.1038, + "step": 5870 + }, + { + "epoch": 6.555183946488294, + "grad_norm": 1.4130326509475708, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 5880 + }, + { + "epoch": 6.5663322185061315, + "grad_norm": 1.2596930265426636, + "learning_rate": 0.0002, + "loss": 1.1094, + "step": 5890 + }, + { + "epoch": 6.5774804905239685, + "grad_norm": 1.32266104221344, + "learning_rate": 0.0002, + "loss": 1.1445, + "step": 5900 + }, + { + "epoch": 6.588628762541806, + "grad_norm": 1.3093374967575073, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 5910 + }, + { + "epoch": 6.599777034559644, + "grad_norm": 1.0436453819274902, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 5920 + }, + { + "epoch": 6.610925306577481, + "grad_norm": 1.064468502998352, + "learning_rate": 0.0002, + "loss": 1.1358, + "step": 5930 + }, + { + "epoch": 6.622073578595318, + "grad_norm": 1.2561777830123901, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 5940 + }, + { + "epoch": 6.633221850613155, + "grad_norm": 1.2759621143341064, + "learning_rate": 0.0002, + "loss": 1.1088, + "step": 5950 + }, + { + "epoch": 6.644370122630992, + "grad_norm": 1.0602868795394897, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 5960 + }, + { + "epoch": 6.65551839464883, + "grad_norm": 1.2336751222610474, + "learning_rate": 0.0002, + "loss": 1.2081, + "step": 5970 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.1773011684417725, + "learning_rate": 0.0002, + "loss": 1.1264, + "step": 5980 + }, + { + "epoch": 6.677814938684504, + "grad_norm": 1.0779681205749512, + "learning_rate": 0.0002, + "loss": 1.1641, + "step": 5990 + }, + { + "epoch": 6.688963210702341, + "grad_norm": 1.396223783493042, + "learning_rate": 0.0002, + "loss": 1.1034, + "step": 6000 + }, + { + "epoch": 6.700111482720178, + "grad_norm": 1.2238768339157104, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 6010 + }, + { + "epoch": 6.711259754738015, + "grad_norm": 1.1152666807174683, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 6020 + }, + { + "epoch": 6.722408026755852, + "grad_norm": 1.2376031875610352, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 6030 + }, + { + "epoch": 6.73355629877369, + "grad_norm": 1.0868488550186157, + "learning_rate": 0.0002, + "loss": 1.1062, + "step": 6040 + }, + { + "epoch": 6.744704570791527, + "grad_norm": 1.265913724899292, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 6050 + }, + { + "epoch": 6.7558528428093645, + "grad_norm": 1.1551072597503662, + "learning_rate": 0.0002, + "loss": 1.0959, + "step": 6060 + }, + { + "epoch": 6.767001114827202, + "grad_norm": 1.0813109874725342, + "learning_rate": 0.0002, + "loss": 1.1395, + "step": 6070 + }, + { + "epoch": 6.778149386845039, + "grad_norm": 1.2367933988571167, + "learning_rate": 0.0002, + "loss": 1.1047, + "step": 6080 + }, + { + "epoch": 6.789297658862877, + "grad_norm": 1.1612437963485718, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 6090 + }, + { + "epoch": 6.800445930880714, + "grad_norm": 1.2715837955474854, + "learning_rate": 0.0002, + "loss": 1.1462, + "step": 6100 + }, + { + "epoch": 6.811594202898551, + "grad_norm": 1.1385036706924438, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 6110 + }, + { + "epoch": 6.822742474916388, + "grad_norm": 1.4322341680526733, + "learning_rate": 0.0002, + "loss": 1.137, + "step": 6120 + }, + { + "epoch": 6.833890746934225, + "grad_norm": 1.2975877523422241, + "learning_rate": 0.0002, + "loss": 1.1571, + "step": 6130 + }, + { + "epoch": 6.845039018952063, + "grad_norm": 1.0241044759750366, + "learning_rate": 0.0002, + "loss": 1.1592, + "step": 6140 + }, + { + "epoch": 6.8561872909699, + "grad_norm": 1.352594017982483, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 6150 + }, + { + "epoch": 6.867335562987737, + "grad_norm": 1.1166167259216309, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 6160 + }, + { + "epoch": 6.878483835005574, + "grad_norm": 1.1596941947937012, + "learning_rate": 0.0002, + "loss": 1.1409, + "step": 6170 + }, + { + "epoch": 6.889632107023411, + "grad_norm": 1.5753912925720215, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 6180 + }, + { + "epoch": 6.900780379041248, + "grad_norm": 1.1857494115829468, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 6190 + }, + { + "epoch": 6.911928651059085, + "grad_norm": 1.1507896184921265, + "learning_rate": 0.0002, + "loss": 1.137, + "step": 6200 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 1.5194647312164307, + "learning_rate": 0.0002, + "loss": 1.1532, + "step": 6210 + }, + { + "epoch": 6.93422519509476, + "grad_norm": 1.1627732515335083, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 6220 + }, + { + "epoch": 6.9453734671125975, + "grad_norm": 1.1929609775543213, + "learning_rate": 0.0002, + "loss": 1.1079, + "step": 6230 + }, + { + "epoch": 6.956521739130435, + "grad_norm": 1.2704664468765259, + "learning_rate": 0.0002, + "loss": 1.1331, + "step": 6240 + }, + { + "epoch": 6.967670011148272, + "grad_norm": 1.1791198253631592, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 6250 + }, + { + "epoch": 6.97881828316611, + "grad_norm": 1.1948790550231934, + "learning_rate": 0.0002, + "loss": 1.1152, + "step": 6260 + }, + { + "epoch": 6.989966555183947, + "grad_norm": 1.222116231918335, + "learning_rate": 0.0002, + "loss": 1.1213, + "step": 6270 + }, + { + "epoch": 7.0, + "eval_loss": 2.174532890319824, + "eval_runtime": 38.0962, + "eval_samples_per_second": 13.518, + "eval_steps_per_second": 1.706, + "step": 6279 + }, + { + "epoch": 7.001114827201784, + "grad_norm": 1.0389306545257568, + "learning_rate": 0.0002, + "loss": 1.1558, + "step": 6280 + }, + { + "epoch": 7.012263099219621, + "grad_norm": 1.5281798839569092, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 6290 + }, + { + "epoch": 7.023411371237458, + "grad_norm": 1.097888708114624, + "learning_rate": 0.0002, + "loss": 0.9557, + "step": 6300 + }, + { + "epoch": 7.034559643255295, + "grad_norm": 1.4041006565093994, + "learning_rate": 0.0002, + "loss": 0.9435, + "step": 6310 + }, + { + "epoch": 7.045707915273133, + "grad_norm": 1.3070768117904663, + "learning_rate": 0.0002, + "loss": 0.9183, + "step": 6320 + }, + { + "epoch": 7.05685618729097, + "grad_norm": 1.5640852451324463, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 6330 + }, + { + "epoch": 7.068004459308807, + "grad_norm": 1.5929399728775024, + "learning_rate": 0.0002, + "loss": 0.874, + "step": 6340 + }, + { + "epoch": 7.079152731326644, + "grad_norm": 1.2621946334838867, + "learning_rate": 0.0002, + "loss": 0.8461, + "step": 6350 + }, + { + "epoch": 7.090301003344481, + "grad_norm": 1.9438022375106812, + "learning_rate": 0.0002, + "loss": 0.9601, + "step": 6360 + }, + { + "epoch": 7.101449275362318, + "grad_norm": 1.3711209297180176, + "learning_rate": 0.0002, + "loss": 0.9348, + "step": 6370 + }, + { + "epoch": 7.112597547380156, + "grad_norm": 1.2935353517532349, + "learning_rate": 0.0002, + "loss": 0.9318, + "step": 6380 + }, + { + "epoch": 7.1237458193979935, + "grad_norm": 1.4326812028884888, + "learning_rate": 0.0002, + "loss": 0.9687, + "step": 6390 + }, + { + "epoch": 7.1348940914158305, + "grad_norm": 1.604068398475647, + "learning_rate": 0.0002, + "loss": 0.9552, + "step": 6400 + }, + { + "epoch": 7.146042363433668, + "grad_norm": 1.5581567287445068, + "learning_rate": 0.0002, + "loss": 0.9692, + "step": 6410 + }, + { + "epoch": 7.157190635451505, + "grad_norm": 1.3148343563079834, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 6420 + }, + { + "epoch": 7.168338907469343, + "grad_norm": 1.3319238424301147, + "learning_rate": 0.0002, + "loss": 0.9401, + "step": 6430 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 1.3741648197174072, + "learning_rate": 0.0002, + "loss": 0.9306, + "step": 6440 + }, + { + "epoch": 7.190635451505017, + "grad_norm": 1.2071956396102905, + "learning_rate": 0.0002, + "loss": 0.9681, + "step": 6450 + }, + { + "epoch": 7.201783723522854, + "grad_norm": 1.4183731079101562, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 6460 + }, + { + "epoch": 7.212931995540691, + "grad_norm": 1.4467699527740479, + "learning_rate": 0.0002, + "loss": 0.9611, + "step": 6470 + }, + { + "epoch": 7.224080267558528, + "grad_norm": 1.3801071643829346, + "learning_rate": 0.0002, + "loss": 0.9784, + "step": 6480 + }, + { + "epoch": 7.235228539576366, + "grad_norm": 1.6222909688949585, + "learning_rate": 0.0002, + "loss": 0.9463, + "step": 6490 + }, + { + "epoch": 7.246376811594203, + "grad_norm": 1.6431424617767334, + "learning_rate": 0.0002, + "loss": 0.9701, + "step": 6500 + }, + { + "epoch": 7.25752508361204, + "grad_norm": 1.4911304712295532, + "learning_rate": 0.0002, + "loss": 0.937, + "step": 6510 + }, + { + "epoch": 7.268673355629877, + "grad_norm": 1.3448628187179565, + "learning_rate": 0.0002, + "loss": 0.933, + "step": 6520 + }, + { + "epoch": 7.279821627647714, + "grad_norm": 1.2078956365585327, + "learning_rate": 0.0002, + "loss": 0.9399, + "step": 6530 + }, + { + "epoch": 7.290969899665551, + "grad_norm": 1.6037310361862183, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 6540 + }, + { + "epoch": 7.302118171683389, + "grad_norm": 1.541955828666687, + "learning_rate": 0.0002, + "loss": 0.9763, + "step": 6550 + }, + { + "epoch": 7.3132664437012265, + "grad_norm": 1.5351279973983765, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 6560 + }, + { + "epoch": 7.3244147157190636, + "grad_norm": 1.4032648801803589, + "learning_rate": 0.0002, + "loss": 0.9742, + "step": 6570 + }, + { + "epoch": 7.335562987736901, + "grad_norm": 1.1339422464370728, + "learning_rate": 0.0002, + "loss": 0.9687, + "step": 6580 + }, + { + "epoch": 7.346711259754738, + "grad_norm": 1.2702211141586304, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 6590 + }, + { + "epoch": 7.357859531772576, + "grad_norm": 1.2987596988677979, + "learning_rate": 0.0002, + "loss": 0.9823, + "step": 6600 + }, + { + "epoch": 7.369007803790413, + "grad_norm": 1.506354808807373, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 6610 + }, + { + "epoch": 7.38015607580825, + "grad_norm": 1.2649177312850952, + "learning_rate": 0.0002, + "loss": 0.979, + "step": 6620 + }, + { + "epoch": 7.391304347826087, + "grad_norm": 1.4871227741241455, + "learning_rate": 0.0002, + "loss": 0.9905, + "step": 6630 + }, + { + "epoch": 7.402452619843924, + "grad_norm": 1.6173475980758667, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 6640 + }, + { + "epoch": 7.413600891861761, + "grad_norm": 1.2726142406463623, + "learning_rate": 0.0002, + "loss": 0.9615, + "step": 6650 + }, + { + "epoch": 7.424749163879599, + "grad_norm": 1.4965415000915527, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 6660 + }, + { + "epoch": 7.435897435897436, + "grad_norm": 1.4861866235733032, + "learning_rate": 0.0002, + "loss": 0.9776, + "step": 6670 + }, + { + "epoch": 7.447045707915273, + "grad_norm": 1.6286227703094482, + "learning_rate": 0.0002, + "loss": 0.9861, + "step": 6680 + }, + { + "epoch": 7.45819397993311, + "grad_norm": 1.5688917636871338, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 6690 + }, + { + "epoch": 7.469342251950947, + "grad_norm": 1.2886908054351807, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 6700 + }, + { + "epoch": 7.4804905239687844, + "grad_norm": 1.5951329469680786, + "learning_rate": 0.0002, + "loss": 0.9773, + "step": 6710 + }, + { + "epoch": 7.491638795986622, + "grad_norm": 1.4492952823638916, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 6720 + }, + { + "epoch": 7.5027870680044595, + "grad_norm": 1.6316872835159302, + "learning_rate": 0.0002, + "loss": 1.0378, + "step": 6730 + }, + { + "epoch": 7.513935340022297, + "grad_norm": 1.471291422843933, + "learning_rate": 0.0002, + "loss": 0.9678, + "step": 6740 + }, + { + "epoch": 7.525083612040134, + "grad_norm": 1.5187207460403442, + "learning_rate": 0.0002, + "loss": 0.9368, + "step": 6750 + }, + { + "epoch": 7.536231884057971, + "grad_norm": 1.5191140174865723, + "learning_rate": 0.0002, + "loss": 1.0068, + "step": 6760 + }, + { + "epoch": 7.547380156075809, + "grad_norm": 1.402166485786438, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 6770 + }, + { + "epoch": 7.558528428093646, + "grad_norm": 1.4154515266418457, + "learning_rate": 0.0002, + "loss": 0.9712, + "step": 6780 + }, + { + "epoch": 7.569676700111483, + "grad_norm": 1.530374526977539, + "learning_rate": 0.0002, + "loss": 0.9181, + "step": 6790 + }, + { + "epoch": 7.58082497212932, + "grad_norm": 1.335096836090088, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 6800 + }, + { + "epoch": 7.591973244147157, + "grad_norm": 1.5730568170547485, + "learning_rate": 0.0002, + "loss": 0.922, + "step": 6810 + }, + { + "epoch": 7.603121516164994, + "grad_norm": 1.4692550897598267, + "learning_rate": 0.0002, + "loss": 0.9806, + "step": 6820 + }, + { + "epoch": 7.614269788182831, + "grad_norm": 1.3645410537719727, + "learning_rate": 0.0002, + "loss": 0.9719, + "step": 6830 + }, + { + "epoch": 7.625418060200669, + "grad_norm": 1.5139234066009521, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 6840 + }, + { + "epoch": 7.636566332218506, + "grad_norm": 1.4001535177230835, + "learning_rate": 0.0002, + "loss": 1.007, + "step": 6850 + }, + { + "epoch": 7.647714604236343, + "grad_norm": 1.5518683195114136, + "learning_rate": 0.0002, + "loss": 1.0315, + "step": 6860 + }, + { + "epoch": 7.65886287625418, + "grad_norm": 1.6151013374328613, + "learning_rate": 0.0002, + "loss": 1.0058, + "step": 6870 + }, + { + "epoch": 7.6700111482720175, + "grad_norm": 1.5577940940856934, + "learning_rate": 0.0002, + "loss": 0.9789, + "step": 6880 + }, + { + "epoch": 7.681159420289855, + "grad_norm": 1.2788935899734497, + "learning_rate": 0.0002, + "loss": 0.9728, + "step": 6890 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 1.3274600505828857, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 6900 + }, + { + "epoch": 7.70345596432553, + "grad_norm": 1.3590648174285889, + "learning_rate": 0.0002, + "loss": 0.9739, + "step": 6910 + }, + { + "epoch": 7.714604236343367, + "grad_norm": 1.4309452772140503, + "learning_rate": 0.0002, + "loss": 0.9639, + "step": 6920 + }, + { + "epoch": 7.725752508361204, + "grad_norm": 1.3435392379760742, + "learning_rate": 0.0002, + "loss": 0.9725, + "step": 6930 + }, + { + "epoch": 7.736900780379042, + "grad_norm": 1.519593358039856, + "learning_rate": 0.0002, + "loss": 1.0013, + "step": 6940 + }, + { + "epoch": 7.748049052396879, + "grad_norm": 1.1542080640792847, + "learning_rate": 0.0002, + "loss": 0.9149, + "step": 6950 + }, + { + "epoch": 7.759197324414716, + "grad_norm": 1.3358652591705322, + "learning_rate": 0.0002, + "loss": 1.055, + "step": 6960 + }, + { + "epoch": 7.770345596432553, + "grad_norm": 1.526912808418274, + "learning_rate": 0.0002, + "loss": 0.9777, + "step": 6970 + }, + { + "epoch": 7.78149386845039, + "grad_norm": 1.303989052772522, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 6980 + }, + { + "epoch": 7.792642140468227, + "grad_norm": 1.3185025453567505, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 6990 + }, + { + "epoch": 7.803790412486064, + "grad_norm": 1.3556475639343262, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 7000 + }, + { + "epoch": 7.814938684503902, + "grad_norm": 1.3264387845993042, + "learning_rate": 0.0002, + "loss": 1.0184, + "step": 7010 + }, + { + "epoch": 7.826086956521739, + "grad_norm": 1.4610573053359985, + "learning_rate": 0.0002, + "loss": 0.9507, + "step": 7020 + }, + { + "epoch": 7.837235228539576, + "grad_norm": 1.39540433883667, + "learning_rate": 0.0002, + "loss": 0.9847, + "step": 7030 + }, + { + "epoch": 7.848383500557413, + "grad_norm": 1.5537383556365967, + "learning_rate": 0.0002, + "loss": 1.0302, + "step": 7040 + }, + { + "epoch": 7.8595317725752505, + "grad_norm": 1.6064108610153198, + "learning_rate": 0.0002, + "loss": 0.9941, + "step": 7050 + }, + { + "epoch": 7.8706800445930885, + "grad_norm": 1.4497601985931396, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 7060 + }, + { + "epoch": 7.8818283166109255, + "grad_norm": 1.3896540403366089, + "learning_rate": 0.0002, + "loss": 1.0416, + "step": 7070 + }, + { + "epoch": 7.892976588628763, + "grad_norm": 1.4320734739303589, + "learning_rate": 0.0002, + "loss": 0.9959, + "step": 7080 + }, + { + "epoch": 7.9041248606466, + "grad_norm": 1.3116543292999268, + "learning_rate": 0.0002, + "loss": 1.0181, + "step": 7090 + }, + { + "epoch": 7.915273132664437, + "grad_norm": 1.290254831314087, + "learning_rate": 0.0002, + "loss": 1.0162, + "step": 7100 + }, + { + "epoch": 7.926421404682275, + "grad_norm": 1.4764007329940796, + "learning_rate": 0.0002, + "loss": 1.0486, + "step": 7110 + }, + { + "epoch": 7.937569676700112, + "grad_norm": 1.4759361743927002, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 7120 + }, + { + "epoch": 7.948717948717949, + "grad_norm": 1.4465186595916748, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 7130 + }, + { + "epoch": 7.959866220735786, + "grad_norm": 1.333365797996521, + "learning_rate": 0.0002, + "loss": 0.9883, + "step": 7140 + }, + { + "epoch": 7.971014492753623, + "grad_norm": 1.5393798351287842, + "learning_rate": 0.0002, + "loss": 0.9918, + "step": 7150 + }, + { + "epoch": 7.98216276477146, + "grad_norm": 1.3893442153930664, + "learning_rate": 0.0002, + "loss": 1.0166, + "step": 7160 + }, + { + "epoch": 7.993311036789297, + "grad_norm": 1.4354097843170166, + "learning_rate": 0.0002, + "loss": 1.052, + "step": 7170 + }, + { + "epoch": 8.0, + "eval_loss": 2.338440418243408, + "eval_runtime": 38.0319, + "eval_samples_per_second": 13.541, + "eval_steps_per_second": 1.709, + "step": 7176 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.320892122852229e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-7176/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ea53ebb86c72461c11f758f00bf424fedd97760 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bea192d81b9d4aafeb75b6b5d65d07199ee6779591868790e583ce5f06ee53e4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f650de9c0be78b9b779ea39b638a103154a0651d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35371773ddb9337f1979f38418874577db6e6ff8c1a5af6bc6abbffe351e8b8d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..182e054d5b7c0baffba0eb71b070d2c1c0be3ef6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a26332468f4bb0368241f80d1b0f6e8b76ce047fa47c65c7c52f652ec0291296 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f6c49ff8fc079811cb5662793c8917ec8403621 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2ae9907c7702d2c35e5b90bb41a1a507933e14676d7f04127cd902bb4770a9 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..157c1b3f10fd5c166cb3ea7e90bb4bfde20dc179 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/trainer_state.json @@ -0,0 +1,664 @@ +{ + "best_metric": 1.8143481016159058, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 897, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011148272017837236, + "grad_norm": 0.4864582419395447, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 10 + }, + { + "epoch": 0.022296544035674472, + "grad_norm": 0.6151555776596069, + "learning_rate": 0.0002, + "loss": 2.2959, + "step": 20 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 0.541170060634613, + "learning_rate": 0.0002, + "loss": 2.008, + "step": 30 + }, + { + "epoch": 0.044593088071348944, + "grad_norm": 0.4160577058792114, + "learning_rate": 0.0002, + "loss": 1.9404, + "step": 40 + }, + { + "epoch": 0.055741360089186176, + "grad_norm": 0.5151045918464661, + "learning_rate": 0.0002, + "loss": 1.9695, + "step": 50 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 0.4899227023124695, + "learning_rate": 0.0002, + "loss": 1.9375, + "step": 60 + }, + { + "epoch": 0.07803790412486064, + "grad_norm": 0.6387737393379211, + "learning_rate": 0.0002, + "loss": 1.8537, + "step": 70 + }, + { + "epoch": 0.08918617614269789, + "grad_norm": 0.44113653898239136, + "learning_rate": 0.0002, + "loss": 1.8591, + "step": 80 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 0.4688360393047333, + "learning_rate": 0.0002, + "loss": 1.9253, + "step": 90 + }, + { + "epoch": 0.11148272017837235, + "grad_norm": 0.44789502024650574, + "learning_rate": 0.0002, + "loss": 1.9809, + "step": 100 + }, + { + "epoch": 0.12263099219620958, + "grad_norm": 0.4484880864620209, + "learning_rate": 0.0002, + "loss": 1.8297, + "step": 110 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 0.46527230739593506, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 120 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 0.5095470547676086, + "learning_rate": 0.0002, + "loss": 1.8941, + "step": 130 + }, + { + "epoch": 0.15607580824972128, + "grad_norm": 0.4180101752281189, + "learning_rate": 0.0002, + "loss": 1.8936, + "step": 140 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 0.45976975560188293, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 150 + }, + { + "epoch": 0.17837235228539577, + "grad_norm": 0.43929311633110046, + "learning_rate": 0.0002, + "loss": 1.8996, + "step": 160 + }, + { + "epoch": 0.189520624303233, + "grad_norm": 0.43384963274002075, + "learning_rate": 0.0002, + "loss": 1.828, + "step": 170 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 0.4810775816440582, + "learning_rate": 0.0002, + "loss": 1.8599, + "step": 180 + }, + { + "epoch": 0.21181716833890746, + "grad_norm": 0.4231500029563904, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.2229654403567447, + "grad_norm": 0.40217751264572144, + "learning_rate": 0.0002, + "loss": 1.8029, + "step": 200 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.3772163689136505, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 210 + }, + { + "epoch": 0.24526198439241917, + "grad_norm": 0.3765389621257782, + "learning_rate": 0.0002, + "loss": 1.8709, + "step": 220 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.3947426378726959, + "learning_rate": 0.0002, + "loss": 1.8571, + "step": 230 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.38083791732788086, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 240 + }, + { + "epoch": 0.2787068004459309, + "grad_norm": 0.6683781743049622, + "learning_rate": 0.0002, + "loss": 1.7449, + "step": 250 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 0.41476085782051086, + "learning_rate": 0.0002, + "loss": 1.787, + "step": 260 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 0.3722982704639435, + "learning_rate": 0.0002, + "loss": 1.8212, + "step": 270 + }, + { + "epoch": 0.31215161649944256, + "grad_norm": 0.4132225811481476, + "learning_rate": 0.0002, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.3232998885172798, + "grad_norm": 0.41937923431396484, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 290 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 0.3839682340621948, + "learning_rate": 0.0002, + "loss": 1.9065, + "step": 300 + }, + { + "epoch": 0.3455964325529543, + "grad_norm": 0.33736854791641235, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 310 + }, + { + "epoch": 0.35674470457079155, + "grad_norm": 0.4552125334739685, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 320 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 0.3592551350593567, + "learning_rate": 0.0002, + "loss": 1.8141, + "step": 330 + }, + { + "epoch": 0.379041248606466, + "grad_norm": 0.3872784972190857, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 340 + }, + { + "epoch": 0.39018952062430323, + "grad_norm": 0.35498011112213135, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 350 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 0.3489432632923126, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 360 + }, + { + "epoch": 0.4124860646599777, + "grad_norm": 0.3511202037334442, + "learning_rate": 0.0002, + "loss": 1.8374, + "step": 370 + }, + { + "epoch": 0.4236343366778149, + "grad_norm": 0.3891856074333191, + "learning_rate": 0.0002, + "loss": 1.7845, + "step": 380 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.4112119972705841, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 390 + }, + { + "epoch": 0.4459308807134894, + "grad_norm": 0.3329351246356964, + "learning_rate": 0.0002, + "loss": 1.7746, + "step": 400 + }, + { + "epoch": 0.45707915273132665, + "grad_norm": 0.32010194659233093, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 410 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 0.3335704505443573, + "learning_rate": 0.0002, + "loss": 1.8266, + "step": 420 + }, + { + "epoch": 0.4793756967670011, + "grad_norm": 0.3508165180683136, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 430 + }, + { + "epoch": 0.49052396878483834, + "grad_norm": 0.3818604052066803, + "learning_rate": 0.0002, + "loss": 1.8241, + "step": 440 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 0.37044021487236023, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 450 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 0.3258146047592163, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 460 + }, + { + "epoch": 0.5239687848383501, + "grad_norm": 0.3390968143939972, + "learning_rate": 0.0002, + "loss": 1.8662, + "step": 470 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 0.41194117069244385, + "learning_rate": 0.0002, + "loss": 1.8545, + "step": 480 + }, + { + "epoch": 0.5462653288740246, + "grad_norm": 0.34630897641181946, + "learning_rate": 0.0002, + "loss": 1.8727, + "step": 490 + }, + { + "epoch": 0.5574136008918618, + "grad_norm": 0.28459733724594116, + "learning_rate": 0.0002, + "loss": 1.7747, + "step": 500 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 0.33051759004592896, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 510 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.37259650230407715, + "learning_rate": 0.0002, + "loss": 1.8997, + "step": 520 + }, + { + "epoch": 0.5908584169453734, + "grad_norm": 0.4604213833808899, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 530 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 0.3107241988182068, + "learning_rate": 0.0002, + "loss": 1.7226, + "step": 540 + }, + { + "epoch": 0.6131549609810479, + "grad_norm": 0.34454235434532166, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 550 + }, + { + "epoch": 0.6243032329988851, + "grad_norm": 0.32745128870010376, + "learning_rate": 0.0002, + "loss": 1.8061, + "step": 560 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 0.32668930292129517, + "learning_rate": 0.0002, + "loss": 1.8565, + "step": 570 + }, + { + "epoch": 0.6465997770345596, + "grad_norm": 0.31747013330459595, + "learning_rate": 0.0002, + "loss": 1.7705, + "step": 580 + }, + { + "epoch": 0.6577480490523969, + "grad_norm": 0.3399045169353485, + "learning_rate": 0.0002, + "loss": 1.7835, + "step": 590 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.40407994389533997, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 600 + }, + { + "epoch": 0.6800445930880713, + "grad_norm": 0.3739639222621918, + "learning_rate": 0.0002, + "loss": 1.8037, + "step": 610 + }, + { + "epoch": 0.6911928651059086, + "grad_norm": 0.3739263713359833, + "learning_rate": 0.0002, + "loss": 1.8654, + "step": 620 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.3418176770210266, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 630 + }, + { + "epoch": 0.7134894091415831, + "grad_norm": 0.3314031660556793, + "learning_rate": 0.0002, + "loss": 1.8081, + "step": 640 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 0.3569042384624481, + "learning_rate": 0.0002, + "loss": 1.7452, + "step": 650 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.4068199098110199, + "learning_rate": 0.0002, + "loss": 1.8655, + "step": 660 + }, + { + "epoch": 0.7469342251950948, + "grad_norm": 0.385543555021286, + "learning_rate": 0.0002, + "loss": 1.748, + "step": 670 + }, + { + "epoch": 0.758082497212932, + "grad_norm": 0.3103431165218353, + "learning_rate": 0.0002, + "loss": 1.8055, + "step": 680 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.32295092940330505, + "learning_rate": 0.0002, + "loss": 1.7255, + "step": 690 + }, + { + "epoch": 0.7803790412486065, + "grad_norm": 0.38221824169158936, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 700 + }, + { + "epoch": 0.7915273132664437, + "grad_norm": 0.3228561282157898, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 710 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 0.32148292660713196, + "learning_rate": 0.0002, + "loss": 1.8552, + "step": 720 + }, + { + "epoch": 0.8138238573021181, + "grad_norm": 0.3125041723251343, + "learning_rate": 0.0002, + "loss": 1.823, + "step": 730 + }, + { + "epoch": 0.8249721293199554, + "grad_norm": 0.43717217445373535, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 740 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.32372939586639404, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 750 + }, + { + "epoch": 0.8472686733556298, + "grad_norm": 0.3270736336708069, + "learning_rate": 0.0002, + "loss": 1.7855, + "step": 760 + }, + { + "epoch": 0.8584169453734671, + "grad_norm": 0.32658815383911133, + "learning_rate": 0.0002, + "loss": 1.8283, + "step": 770 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.3742631673812866, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 780 + }, + { + "epoch": 0.8807134894091416, + "grad_norm": 0.3322608172893524, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 790 + }, + { + "epoch": 0.8918617614269788, + "grad_norm": 0.441494882106781, + "learning_rate": 0.0002, + "loss": 1.7984, + "step": 800 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 0.38793420791625977, + "learning_rate": 0.0002, + "loss": 1.8352, + "step": 810 + }, + { + "epoch": 0.9141583054626533, + "grad_norm": 0.4095474183559418, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 820 + }, + { + "epoch": 0.9253065774804905, + "grad_norm": 0.36847662925720215, + "learning_rate": 0.0002, + "loss": 1.7837, + "step": 830 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 0.28806909918785095, + "learning_rate": 0.0002, + "loss": 1.7867, + "step": 840 + }, + { + "epoch": 0.947603121516165, + "grad_norm": 0.3261156976222992, + "learning_rate": 0.0002, + "loss": 1.848, + "step": 850 + }, + { + "epoch": 0.9587513935340022, + "grad_norm": 0.4674798250198364, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 860 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.30819064378738403, + "learning_rate": 0.0002, + "loss": 1.7742, + "step": 870 + }, + { + "epoch": 0.9810479375696767, + "grad_norm": 0.32203033566474915, + "learning_rate": 0.0002, + "loss": 1.8184, + "step": 880 + }, + { + "epoch": 0.992196209587514, + "grad_norm": 0.3409714102745056, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 890 + }, + { + "epoch": 1.0, + "eval_loss": 1.8143481016159058, + "eval_runtime": 37.921, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 897 + } + ], + "logging_steps": 10, + "max_steps": 7176, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.151115153565286e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..95338fad5207d5443dc0365c8c2248fc7e5ee897 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3599a019be490123de30c242ae69005d5b9650ce503103f1bf42e7f3cead11d3 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..9a201669613fbd49ad73694c741c4623e5f0b6f5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 897, "epoch_duration": 966.433441400528, "total_accumulated_duration": 966.433441400528, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}]} +{"epoch": 2.0, "step": 1794, "epoch_duration": 974.5328199863434, "total_accumulated_duration": 1940.9662613868713, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-897", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}]} +{"epoch": 3.0, "step": 2691, "epoch_duration": 976.4876337051392, "total_accumulated_duration": 2917.4538950920105, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}]} +{"epoch": 4.0, "step": 3588, "epoch_duration": 1003.2108914852142, "total_accumulated_duration": 3920.6647865772247, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}, {"eval_loss": 1.8300215005874634, "eval_runtime": 38.0349, "eval_samples_per_second": 13.54, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 2691}, {"loss": 1.5768, "grad_norm": 0.4742245078086853, "learning_rate": 0.0002, "epoch": 3.0100334448160537, "step": 2700}, {"loss": 1.4859, "grad_norm": 0.5157448649406433, "learning_rate": 0.0002, "epoch": 3.021181716833891, "step": 2710}, {"loss": 1.4219, "grad_norm": 0.5634726285934448, "learning_rate": 0.0002, "epoch": 3.032329988851728, "step": 2720}, {"loss": 1.5452, "grad_norm": 0.4554799199104309, "learning_rate": 0.0002, "epoch": 3.0434782608695654, "step": 2730}, {"loss": 1.4784, "grad_norm": 0.6565208435058594, "learning_rate": 0.0002, "epoch": 3.0546265328874025, "step": 2740}, {"loss": 1.459, "grad_norm": 0.6174370050430298, "learning_rate": 0.0002, "epoch": 3.0657748049052396, "step": 2750}, {"loss": 1.469, "grad_norm": 0.4987483024597168, "learning_rate": 0.0002, "epoch": 3.076923076923077, "step": 2760}, {"loss": 1.5466, "grad_norm": 0.5810927152633667, "learning_rate": 0.0002, "epoch": 3.088071348940914, "step": 2770}, {"loss": 1.4936, "grad_norm": 0.5281634330749512, "learning_rate": 0.0002, "epoch": 3.0992196209587513, "step": 2780}, {"loss": 1.4751, "grad_norm": 0.5479053854942322, "learning_rate": 0.0002, "epoch": 3.1103678929765888, "step": 2790}, {"loss": 1.5601, "grad_norm": 0.6192978620529175, "learning_rate": 0.0002, "epoch": 3.121516164994426, "step": 2800}, {"loss": 1.4888, "grad_norm": 0.560117781162262, "learning_rate": 0.0002, "epoch": 3.132664437012263, "step": 2810}, {"loss": 1.5495, "grad_norm": 0.6067224740982056, "learning_rate": 0.0002, "epoch": 3.1438127090301005, "step": 2820}, {"loss": 1.5239, "grad_norm": 0.611287534236908, "learning_rate": 0.0002, "epoch": 3.1549609810479375, "step": 2830}, {"loss": 1.4577, "grad_norm": 0.6441587209701538, "learning_rate": 0.0002, "epoch": 3.1661092530657746, "step": 2840}, {"loss": 1.5322, "grad_norm": 0.5955114364624023, "learning_rate": 0.0002, "epoch": 3.177257525083612, "step": 2850}, {"loss": 1.5222, "grad_norm": 0.5554782748222351, "learning_rate": 0.0002, "epoch": 3.1884057971014492, "step": 2860}, {"loss": 1.4676, "grad_norm": 0.5411370992660522, "learning_rate": 0.0002, "epoch": 3.1995540691192863, "step": 2870}, {"loss": 1.5008, "grad_norm": 0.6152016520500183, "learning_rate": 0.0002, "epoch": 3.210702341137124, "step": 2880}, {"loss": 1.5229, "grad_norm": 0.5711581110954285, "learning_rate": 0.0002, "epoch": 3.221850613154961, "step": 2890}, {"loss": 1.5255, "grad_norm": 0.5399307012557983, "learning_rate": 0.0002, "epoch": 3.2329988851727984, "step": 2900}, {"loss": 1.4888, "grad_norm": 0.60606849193573, "learning_rate": 0.0002, "epoch": 3.2441471571906355, "step": 2910}, {"loss": 1.5056, "grad_norm": 0.5873523950576782, "learning_rate": 0.0002, "epoch": 3.2552954292084726, "step": 2920}, {"loss": 1.5208, "grad_norm": 0.6149439215660095, "learning_rate": 0.0002, "epoch": 3.26644370122631, "step": 2930}, {"loss": 1.4942, "grad_norm": 0.5940659046173096, "learning_rate": 0.0002, "epoch": 3.277591973244147, "step": 2940}, {"loss": 1.5031, "grad_norm": 0.6846756339073181, "learning_rate": 0.0002, "epoch": 3.2887402452619843, "step": 2950}, {"loss": 1.5425, "grad_norm": 0.6708254218101501, "learning_rate": 0.0002, "epoch": 3.299888517279822, "step": 2960}, {"loss": 1.5319, "grad_norm": 0.5966503620147705, "learning_rate": 0.0002, "epoch": 3.311036789297659, "step": 2970}, {"loss": 1.5173, "grad_norm": 0.6328812837600708, "learning_rate": 0.0002, "epoch": 3.322185061315496, "step": 2980}, {"loss": 1.5096, "grad_norm": 0.6082745790481567, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2990}, {"loss": 1.5122, "grad_norm": 0.6207539439201355, "learning_rate": 0.0002, "epoch": 3.3444816053511706, "step": 3000}, {"loss": 1.5053, "grad_norm": 0.5501793026924133, "learning_rate": 0.0002, "epoch": 3.3556298773690076, "step": 3010}, {"loss": 1.4428, "grad_norm": 0.571275532245636, "learning_rate": 0.0002, "epoch": 3.366778149386845, "step": 3020}, {"loss": 1.5914, "grad_norm": 0.7003518342971802, "learning_rate": 0.0002, "epoch": 3.3779264214046822, "step": 3030}, {"loss": 1.5359, "grad_norm": 0.609527587890625, "learning_rate": 0.0002, "epoch": 3.3890746934225193, "step": 3040}, {"loss": 1.5072, "grad_norm": 0.5880036354064941, "learning_rate": 0.0002, "epoch": 3.400222965440357, "step": 3050}, {"loss": 1.5451, "grad_norm": 0.5847334265708923, "learning_rate": 0.0002, "epoch": 3.411371237458194, "step": 3060}, {"loss": 1.4738, "grad_norm": 0.5373924970626831, "learning_rate": 0.0002, "epoch": 3.4225195094760315, "step": 3070}, {"loss": 1.5215, "grad_norm": 0.6074833869934082, "learning_rate": 0.0002, "epoch": 3.4336677814938685, "step": 3080}, {"loss": 1.458, "grad_norm": 0.5118414163589478, "learning_rate": 0.0002, "epoch": 3.4448160535117056, "step": 3090}, {"loss": 1.5006, "grad_norm": 0.5577956438064575, "learning_rate": 0.0002, "epoch": 3.4559643255295427, "step": 3100}, {"loss": 1.5057, "grad_norm": 0.5654811859130859, "learning_rate": 0.0002, "epoch": 3.46711259754738, "step": 3110}, {"loss": 1.523, "grad_norm": 0.6216017603874207, "learning_rate": 0.0002, "epoch": 3.4782608695652173, "step": 3120}, {"loss": 1.5292, "grad_norm": 0.5983642339706421, "learning_rate": 0.0002, "epoch": 3.489409141583055, "step": 3130}, {"loss": 1.5568, "grad_norm": 0.6635708212852478, "learning_rate": 0.0002, "epoch": 3.500557413600892, "step": 3140}, {"loss": 1.4633, "grad_norm": 0.6254258751869202, "learning_rate": 0.0002, "epoch": 3.511705685618729, "step": 3150}, {"loss": 1.4934, "grad_norm": 0.6359851360321045, "learning_rate": 0.0002, "epoch": 3.522853957636566, "step": 3160}, {"loss": 1.4693, "grad_norm": 0.5938616394996643, "learning_rate": 0.0002, "epoch": 3.5340022296544036, "step": 3170}, {"loss": 1.4393, "grad_norm": 0.6360630393028259, "learning_rate": 0.0002, "epoch": 3.5451505016722407, "step": 3180}, {"loss": 1.5535, "grad_norm": 0.6097670197486877, "learning_rate": 0.0002, "epoch": 3.556298773690078, "step": 3190}, {"loss": 1.5427, "grad_norm": 0.5984025597572327, "learning_rate": 0.0002, "epoch": 3.5674470457079153, "step": 3200}, {"loss": 1.4741, "grad_norm": 0.5463748574256897, "learning_rate": 0.0002, "epoch": 3.5785953177257523, "step": 3210}, {"loss": 1.513, "grad_norm": 1.0017699003219604, "learning_rate": 0.0002, "epoch": 3.58974358974359, "step": 3220}, {"loss": 1.5687, "grad_norm": 0.6519441604614258, "learning_rate": 0.0002, "epoch": 3.600891861761427, "step": 3230}, {"loss": 1.5168, "grad_norm": 0.6457271575927734, "learning_rate": 0.0002, "epoch": 3.6120401337792645, "step": 3240}, {"loss": 1.5511, "grad_norm": 0.5898868441581726, "learning_rate": 0.0002, "epoch": 3.6231884057971016, "step": 3250}, {"loss": 1.5833, "grad_norm": 0.6612270474433899, "learning_rate": 0.0002, "epoch": 3.6343366778149386, "step": 3260}, {"loss": 1.4537, "grad_norm": 0.5102090239524841, "learning_rate": 0.0002, "epoch": 3.6454849498327757, "step": 3270}, {"loss": 1.4676, "grad_norm": 0.5357231497764587, "learning_rate": 0.0002, "epoch": 3.6566332218506132, "step": 3280}, {"loss": 1.5417, "grad_norm": 0.6176130175590515, "learning_rate": 0.0002, "epoch": 3.6677814938684503, "step": 3290}, {"loss": 1.5057, "grad_norm": 0.6384354829788208, "learning_rate": 0.0002, "epoch": 3.678929765886288, "step": 3300}, {"loss": 1.5973, "grad_norm": 0.5493269562721252, "learning_rate": 0.0002, "epoch": 3.690078037904125, "step": 3310}, {"loss": 1.5958, "grad_norm": 0.5721797943115234, "learning_rate": 0.0002, "epoch": 3.701226309921962, "step": 3320}, {"loss": 1.5098, "grad_norm": 0.6667633056640625, "learning_rate": 0.0002, "epoch": 3.712374581939799, "step": 3330}, {"loss": 1.5372, "grad_norm": 0.5713372826576233, "learning_rate": 0.0002, "epoch": 3.7235228539576366, "step": 3340}, {"loss": 1.5959, "grad_norm": 0.5925018191337585, "learning_rate": 0.0002, "epoch": 3.7346711259754737, "step": 3350}, {"loss": 1.5045, "grad_norm": 0.5660955905914307, "learning_rate": 0.0002, "epoch": 3.745819397993311, "step": 3360}, {"loss": 1.5465, "grad_norm": 0.5470759868621826, "learning_rate": 0.0002, "epoch": 3.7569676700111483, "step": 3370}, {"loss": 1.547, "grad_norm": 0.7612935900688171, "learning_rate": 0.0002, "epoch": 3.7681159420289854, "step": 3380}, {"loss": 1.6224, "grad_norm": 0.577467679977417, "learning_rate": 0.0002, "epoch": 3.779264214046823, "step": 3390}, {"loss": 1.5653, "grad_norm": 0.6125091910362244, "learning_rate": 0.0002, "epoch": 3.79041248606466, "step": 3400}, {"loss": 1.5463, "grad_norm": 0.590386152267456, "learning_rate": 0.0002, "epoch": 3.801560758082497, "step": 3410}, {"loss": 1.5944, "grad_norm": 0.5530361533164978, "learning_rate": 0.0002, "epoch": 3.8127090301003346, "step": 3420}, {"loss": 1.4797, "grad_norm": 0.5714079737663269, "learning_rate": 0.0002, "epoch": 3.8238573021181717, "step": 3430}, {"loss": 1.5324, "grad_norm": 0.9061086773872375, "learning_rate": 0.0002, "epoch": 3.8350055741360087, "step": 3440}, {"loss": 1.4513, "grad_norm": 0.6193320751190186, "learning_rate": 0.0002, "epoch": 3.8461538461538463, "step": 3450}, {"loss": 1.5537, "grad_norm": 0.5831704139709473, "learning_rate": 0.0002, "epoch": 3.8573021181716833, "step": 3460}, {"loss": 1.5144, "grad_norm": 0.5971192717552185, "learning_rate": 0.0002, "epoch": 3.868450390189521, "step": 3470}, {"loss": 1.484, "grad_norm": 0.6110154390335083, "learning_rate": 0.0002, "epoch": 3.879598662207358, "step": 3480}, {"loss": 1.5624, "grad_norm": 0.6644453406333923, "learning_rate": 0.0002, "epoch": 3.890746934225195, "step": 3490}, {"loss": 1.5422, "grad_norm": 0.6674908399581909, "learning_rate": 0.0002, "epoch": 3.901895206243032, "step": 3500}, {"loss": 1.579, "grad_norm": 0.5516519546508789, "learning_rate": 0.0002, "epoch": 3.9130434782608696, "step": 3510}, {"loss": 1.5964, "grad_norm": 0.6704319715499878, "learning_rate": 0.0002, "epoch": 3.9241917502787067, "step": 3520}, {"loss": 1.515, "grad_norm": 0.5820314288139343, "learning_rate": 0.0002, "epoch": 3.9353400222965442, "step": 3530}, {"loss": 1.6458, "grad_norm": 0.6931548714637756, "learning_rate": 0.0002, "epoch": 3.9464882943143813, "step": 3540}, {"loss": 1.5338, "grad_norm": 0.6085171103477478, "learning_rate": 0.0002, "epoch": 3.9576365663322184, "step": 3550}, {"loss": 1.5537, "grad_norm": 0.5973535776138306, "learning_rate": 0.0002, "epoch": 3.9687848383500555, "step": 3560}, {"loss": 1.5435, "grad_norm": 0.49761658906936646, "learning_rate": 0.0002, "epoch": 3.979933110367893, "step": 3570}, {"loss": 1.488, "grad_norm": 0.6282512545585632, "learning_rate": 0.0002, "epoch": 3.99108138238573, "step": 3580}]} +{"epoch": 5.0, "step": 4485, "epoch_duration": 1002.3510277271271, "total_accumulated_duration": 4923.015814304352, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}, {"eval_loss": 1.8300215005874634, "eval_runtime": 38.0349, "eval_samples_per_second": 13.54, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 2691}, {"loss": 1.5768, "grad_norm": 0.4742245078086853, "learning_rate": 0.0002, "epoch": 3.0100334448160537, "step": 2700}, {"loss": 1.4859, "grad_norm": 0.5157448649406433, "learning_rate": 0.0002, "epoch": 3.021181716833891, "step": 2710}, {"loss": 1.4219, "grad_norm": 0.5634726285934448, "learning_rate": 0.0002, "epoch": 3.032329988851728, "step": 2720}, {"loss": 1.5452, "grad_norm": 0.4554799199104309, "learning_rate": 0.0002, "epoch": 3.0434782608695654, "step": 2730}, {"loss": 1.4784, "grad_norm": 0.6565208435058594, "learning_rate": 0.0002, "epoch": 3.0546265328874025, "step": 2740}, {"loss": 1.459, "grad_norm": 0.6174370050430298, "learning_rate": 0.0002, "epoch": 3.0657748049052396, "step": 2750}, {"loss": 1.469, "grad_norm": 0.4987483024597168, "learning_rate": 0.0002, "epoch": 3.076923076923077, "step": 2760}, {"loss": 1.5466, "grad_norm": 0.5810927152633667, "learning_rate": 0.0002, "epoch": 3.088071348940914, "step": 2770}, {"loss": 1.4936, "grad_norm": 0.5281634330749512, "learning_rate": 0.0002, "epoch": 3.0992196209587513, "step": 2780}, {"loss": 1.4751, "grad_norm": 0.5479053854942322, "learning_rate": 0.0002, "epoch": 3.1103678929765888, "step": 2790}, {"loss": 1.5601, "grad_norm": 0.6192978620529175, "learning_rate": 0.0002, "epoch": 3.121516164994426, "step": 2800}, {"loss": 1.4888, "grad_norm": 0.560117781162262, "learning_rate": 0.0002, "epoch": 3.132664437012263, "step": 2810}, {"loss": 1.5495, "grad_norm": 0.6067224740982056, "learning_rate": 0.0002, "epoch": 3.1438127090301005, "step": 2820}, {"loss": 1.5239, "grad_norm": 0.611287534236908, "learning_rate": 0.0002, "epoch": 3.1549609810479375, "step": 2830}, {"loss": 1.4577, "grad_norm": 0.6441587209701538, "learning_rate": 0.0002, "epoch": 3.1661092530657746, "step": 2840}, {"loss": 1.5322, "grad_norm": 0.5955114364624023, "learning_rate": 0.0002, "epoch": 3.177257525083612, "step": 2850}, {"loss": 1.5222, "grad_norm": 0.5554782748222351, "learning_rate": 0.0002, "epoch": 3.1884057971014492, "step": 2860}, {"loss": 1.4676, "grad_norm": 0.5411370992660522, "learning_rate": 0.0002, "epoch": 3.1995540691192863, "step": 2870}, {"loss": 1.5008, "grad_norm": 0.6152016520500183, "learning_rate": 0.0002, "epoch": 3.210702341137124, "step": 2880}, {"loss": 1.5229, "grad_norm": 0.5711581110954285, "learning_rate": 0.0002, "epoch": 3.221850613154961, "step": 2890}, {"loss": 1.5255, "grad_norm": 0.5399307012557983, "learning_rate": 0.0002, "epoch": 3.2329988851727984, "step": 2900}, {"loss": 1.4888, "grad_norm": 0.60606849193573, "learning_rate": 0.0002, "epoch": 3.2441471571906355, "step": 2910}, {"loss": 1.5056, "grad_norm": 0.5873523950576782, "learning_rate": 0.0002, "epoch": 3.2552954292084726, "step": 2920}, {"loss": 1.5208, "grad_norm": 0.6149439215660095, "learning_rate": 0.0002, "epoch": 3.26644370122631, "step": 2930}, {"loss": 1.4942, "grad_norm": 0.5940659046173096, "learning_rate": 0.0002, "epoch": 3.277591973244147, "step": 2940}, {"loss": 1.5031, "grad_norm": 0.6846756339073181, "learning_rate": 0.0002, "epoch": 3.2887402452619843, "step": 2950}, {"loss": 1.5425, "grad_norm": 0.6708254218101501, "learning_rate": 0.0002, "epoch": 3.299888517279822, "step": 2960}, {"loss": 1.5319, "grad_norm": 0.5966503620147705, "learning_rate": 0.0002, "epoch": 3.311036789297659, "step": 2970}, {"loss": 1.5173, "grad_norm": 0.6328812837600708, "learning_rate": 0.0002, "epoch": 3.322185061315496, "step": 2980}, {"loss": 1.5096, "grad_norm": 0.6082745790481567, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2990}, {"loss": 1.5122, "grad_norm": 0.6207539439201355, "learning_rate": 0.0002, "epoch": 3.3444816053511706, "step": 3000}, {"loss": 1.5053, "grad_norm": 0.5501793026924133, "learning_rate": 0.0002, "epoch": 3.3556298773690076, "step": 3010}, {"loss": 1.4428, "grad_norm": 0.571275532245636, "learning_rate": 0.0002, "epoch": 3.366778149386845, "step": 3020}, {"loss": 1.5914, "grad_norm": 0.7003518342971802, "learning_rate": 0.0002, "epoch": 3.3779264214046822, "step": 3030}, {"loss": 1.5359, "grad_norm": 0.609527587890625, "learning_rate": 0.0002, "epoch": 3.3890746934225193, "step": 3040}, {"loss": 1.5072, "grad_norm": 0.5880036354064941, "learning_rate": 0.0002, "epoch": 3.400222965440357, "step": 3050}, {"loss": 1.5451, "grad_norm": 0.5847334265708923, "learning_rate": 0.0002, "epoch": 3.411371237458194, "step": 3060}, {"loss": 1.4738, "grad_norm": 0.5373924970626831, "learning_rate": 0.0002, "epoch": 3.4225195094760315, "step": 3070}, {"loss": 1.5215, "grad_norm": 0.6074833869934082, "learning_rate": 0.0002, "epoch": 3.4336677814938685, "step": 3080}, {"loss": 1.458, "grad_norm": 0.5118414163589478, "learning_rate": 0.0002, "epoch": 3.4448160535117056, "step": 3090}, {"loss": 1.5006, "grad_norm": 0.5577956438064575, "learning_rate": 0.0002, "epoch": 3.4559643255295427, "step": 3100}, {"loss": 1.5057, "grad_norm": 0.5654811859130859, "learning_rate": 0.0002, "epoch": 3.46711259754738, "step": 3110}, {"loss": 1.523, "grad_norm": 0.6216017603874207, "learning_rate": 0.0002, "epoch": 3.4782608695652173, "step": 3120}, {"loss": 1.5292, "grad_norm": 0.5983642339706421, "learning_rate": 0.0002, "epoch": 3.489409141583055, "step": 3130}, {"loss": 1.5568, "grad_norm": 0.6635708212852478, "learning_rate": 0.0002, "epoch": 3.500557413600892, "step": 3140}, {"loss": 1.4633, "grad_norm": 0.6254258751869202, "learning_rate": 0.0002, "epoch": 3.511705685618729, "step": 3150}, {"loss": 1.4934, "grad_norm": 0.6359851360321045, "learning_rate": 0.0002, "epoch": 3.522853957636566, "step": 3160}, {"loss": 1.4693, "grad_norm": 0.5938616394996643, "learning_rate": 0.0002, "epoch": 3.5340022296544036, "step": 3170}, {"loss": 1.4393, "grad_norm": 0.6360630393028259, "learning_rate": 0.0002, "epoch": 3.5451505016722407, "step": 3180}, {"loss": 1.5535, "grad_norm": 0.6097670197486877, "learning_rate": 0.0002, "epoch": 3.556298773690078, "step": 3190}, {"loss": 1.5427, "grad_norm": 0.5984025597572327, "learning_rate": 0.0002, "epoch": 3.5674470457079153, "step": 3200}, {"loss": 1.4741, "grad_norm": 0.5463748574256897, "learning_rate": 0.0002, "epoch": 3.5785953177257523, "step": 3210}, {"loss": 1.513, "grad_norm": 1.0017699003219604, "learning_rate": 0.0002, "epoch": 3.58974358974359, "step": 3220}, {"loss": 1.5687, "grad_norm": 0.6519441604614258, "learning_rate": 0.0002, "epoch": 3.600891861761427, "step": 3230}, {"loss": 1.5168, "grad_norm": 0.6457271575927734, "learning_rate": 0.0002, "epoch": 3.6120401337792645, "step": 3240}, {"loss": 1.5511, "grad_norm": 0.5898868441581726, "learning_rate": 0.0002, "epoch": 3.6231884057971016, "step": 3250}, {"loss": 1.5833, "grad_norm": 0.6612270474433899, "learning_rate": 0.0002, "epoch": 3.6343366778149386, "step": 3260}, {"loss": 1.4537, "grad_norm": 0.5102090239524841, "learning_rate": 0.0002, "epoch": 3.6454849498327757, "step": 3270}, {"loss": 1.4676, "grad_norm": 0.5357231497764587, "learning_rate": 0.0002, "epoch": 3.6566332218506132, "step": 3280}, {"loss": 1.5417, "grad_norm": 0.6176130175590515, "learning_rate": 0.0002, "epoch": 3.6677814938684503, "step": 3290}, {"loss": 1.5057, "grad_norm": 0.6384354829788208, "learning_rate": 0.0002, "epoch": 3.678929765886288, "step": 3300}, {"loss": 1.5973, "grad_norm": 0.5493269562721252, "learning_rate": 0.0002, "epoch": 3.690078037904125, "step": 3310}, {"loss": 1.5958, "grad_norm": 0.5721797943115234, "learning_rate": 0.0002, "epoch": 3.701226309921962, "step": 3320}, {"loss": 1.5098, "grad_norm": 0.6667633056640625, "learning_rate": 0.0002, "epoch": 3.712374581939799, "step": 3330}, {"loss": 1.5372, "grad_norm": 0.5713372826576233, "learning_rate": 0.0002, "epoch": 3.7235228539576366, "step": 3340}, {"loss": 1.5959, "grad_norm": 0.5925018191337585, "learning_rate": 0.0002, "epoch": 3.7346711259754737, "step": 3350}, {"loss": 1.5045, "grad_norm": 0.5660955905914307, "learning_rate": 0.0002, "epoch": 3.745819397993311, "step": 3360}, {"loss": 1.5465, "grad_norm": 0.5470759868621826, "learning_rate": 0.0002, "epoch": 3.7569676700111483, "step": 3370}, {"loss": 1.547, "grad_norm": 0.7612935900688171, "learning_rate": 0.0002, "epoch": 3.7681159420289854, "step": 3380}, {"loss": 1.6224, "grad_norm": 0.577467679977417, "learning_rate": 0.0002, "epoch": 3.779264214046823, "step": 3390}, {"loss": 1.5653, "grad_norm": 0.6125091910362244, "learning_rate": 0.0002, "epoch": 3.79041248606466, "step": 3400}, {"loss": 1.5463, "grad_norm": 0.590386152267456, "learning_rate": 0.0002, "epoch": 3.801560758082497, "step": 3410}, {"loss": 1.5944, "grad_norm": 0.5530361533164978, "learning_rate": 0.0002, "epoch": 3.8127090301003346, "step": 3420}, {"loss": 1.4797, "grad_norm": 0.5714079737663269, "learning_rate": 0.0002, "epoch": 3.8238573021181717, "step": 3430}, {"loss": 1.5324, "grad_norm": 0.9061086773872375, "learning_rate": 0.0002, "epoch": 3.8350055741360087, "step": 3440}, {"loss": 1.4513, "grad_norm": 0.6193320751190186, "learning_rate": 0.0002, "epoch": 3.8461538461538463, "step": 3450}, {"loss": 1.5537, "grad_norm": 0.5831704139709473, "learning_rate": 0.0002, "epoch": 3.8573021181716833, "step": 3460}, {"loss": 1.5144, "grad_norm": 0.5971192717552185, "learning_rate": 0.0002, "epoch": 3.868450390189521, "step": 3470}, {"loss": 1.484, "grad_norm": 0.6110154390335083, "learning_rate": 0.0002, "epoch": 3.879598662207358, "step": 3480}, {"loss": 1.5624, "grad_norm": 0.6644453406333923, "learning_rate": 0.0002, "epoch": 3.890746934225195, "step": 3490}, {"loss": 1.5422, "grad_norm": 0.6674908399581909, "learning_rate": 0.0002, "epoch": 3.901895206243032, "step": 3500}, {"loss": 1.579, "grad_norm": 0.5516519546508789, "learning_rate": 0.0002, "epoch": 3.9130434782608696, "step": 3510}, {"loss": 1.5964, "grad_norm": 0.6704319715499878, "learning_rate": 0.0002, "epoch": 3.9241917502787067, "step": 3520}, {"loss": 1.515, "grad_norm": 0.5820314288139343, "learning_rate": 0.0002, "epoch": 3.9353400222965442, "step": 3530}, {"loss": 1.6458, "grad_norm": 0.6931548714637756, "learning_rate": 0.0002, "epoch": 3.9464882943143813, "step": 3540}, {"loss": 1.5338, "grad_norm": 0.6085171103477478, "learning_rate": 0.0002, "epoch": 3.9576365663322184, "step": 3550}, {"loss": 1.5537, "grad_norm": 0.5973535776138306, "learning_rate": 0.0002, "epoch": 3.9687848383500555, "step": 3560}, {"loss": 1.5435, "grad_norm": 0.49761658906936646, "learning_rate": 0.0002, "epoch": 3.979933110367893, "step": 3570}, {"loss": 1.488, "grad_norm": 0.6282512545585632, "learning_rate": 0.0002, "epoch": 3.99108138238573, "step": 3580}, {"eval_loss": 1.8790398836135864, "eval_runtime": 37.9725, "eval_samples_per_second": 13.562, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 3588}, {"loss": 1.5025, "grad_norm": 0.6402973532676697, "learning_rate": 0.0002, "epoch": 4.002229654403568, "step": 3590}, {"loss": 1.3695, "grad_norm": 0.7791030406951904, "learning_rate": 0.0002, "epoch": 4.013377926421405, "step": 3600}, {"loss": 1.3545, "grad_norm": 0.7136624455451965, "learning_rate": 0.0002, "epoch": 4.024526198439242, "step": 3610}, {"loss": 1.3515, "grad_norm": 0.7608486413955688, "learning_rate": 0.0002, "epoch": 4.035674470457079, "step": 3620}, {"loss": 1.3067, "grad_norm": 0.7486591935157776, "learning_rate": 0.0002, "epoch": 4.046822742474917, "step": 3630}, {"loss": 1.3474, "grad_norm": 0.7576302289962769, "learning_rate": 0.0002, "epoch": 4.057971014492754, "step": 3640}, {"loss": 1.3036, "grad_norm": 0.7358254194259644, "learning_rate": 0.0002, "epoch": 4.069119286510591, "step": 3650}, {"loss": 1.3015, "grad_norm": 0.821326494216919, "learning_rate": 0.0002, "epoch": 4.080267558528428, "step": 3660}, {"loss": 1.4186, "grad_norm": 0.7996482253074646, "learning_rate": 0.0002, "epoch": 4.091415830546265, "step": 3670}, {"loss": 1.3671, "grad_norm": 0.8527022004127502, "learning_rate": 0.0002, "epoch": 4.102564102564102, "step": 3680}, {"loss": 1.3818, "grad_norm": 0.7313576340675354, "learning_rate": 0.0002, "epoch": 4.11371237458194, "step": 3690}, {"loss": 1.3307, "grad_norm": 0.7854588627815247, "learning_rate": 0.0002, "epoch": 4.124860646599777, "step": 3700}, {"loss": 1.4174, "grad_norm": 0.6588303446769714, "learning_rate": 0.0002, "epoch": 4.136008918617614, "step": 3710}, {"loss": 1.3674, "grad_norm": 0.7986254692077637, "learning_rate": 0.0002, "epoch": 4.147157190635451, "step": 3720}, {"loss": 1.3505, "grad_norm": 0.6864156126976013, "learning_rate": 0.0002, "epoch": 4.1583054626532885, "step": 3730}, {"loss": 1.2987, "grad_norm": 0.8197885155677795, "learning_rate": 0.0002, "epoch": 4.169453734671126, "step": 3740}, {"loss": 1.3565, "grad_norm": 0.7169402837753296, "learning_rate": 0.0002, "epoch": 4.1806020066889635, "step": 3750}, {"loss": 1.4388, "grad_norm": 0.7948839068412781, "learning_rate": 0.0002, "epoch": 4.191750278706801, "step": 3760}, {"loss": 1.4648, "grad_norm": 0.6775302290916443, "learning_rate": 0.0002, "epoch": 4.202898550724638, "step": 3770}, {"loss": 1.3238, "grad_norm": 0.8913543820381165, "learning_rate": 0.0002, "epoch": 4.214046822742475, "step": 3780}, {"loss": 1.4251, "grad_norm": 0.8046368360519409, "learning_rate": 0.0002, "epoch": 4.225195094760312, "step": 3790}, {"loss": 1.3542, "grad_norm": 0.9359563589096069, "learning_rate": 0.0002, "epoch": 4.236343366778149, "step": 3800}, {"loss": 1.3963, "grad_norm": 0.8012228608131409, "learning_rate": 0.0002, "epoch": 4.247491638795987, "step": 3810}, {"loss": 1.311, "grad_norm": 0.8405851125717163, "learning_rate": 0.0002, "epoch": 4.258639910813824, "step": 3820}, {"loss": 1.3903, "grad_norm": 0.7812899351119995, "learning_rate": 0.0002, "epoch": 4.269788182831661, "step": 3830}, {"loss": 1.4006, "grad_norm": 0.8192463517189026, "learning_rate": 0.0002, "epoch": 4.280936454849498, "step": 3840}, {"loss": 1.3663, "grad_norm": 0.6937220096588135, "learning_rate": 0.0002, "epoch": 4.292084726867335, "step": 3850}, {"loss": 1.391, "grad_norm": 0.7245703935623169, "learning_rate": 0.0002, "epoch": 4.303232998885173, "step": 3860}, {"loss": 1.3351, "grad_norm": 0.7816787362098694, "learning_rate": 0.0002, "epoch": 4.31438127090301, "step": 3870}, {"loss": 1.4316, "grad_norm": 0.7904975414276123, "learning_rate": 0.0002, "epoch": 4.325529542920847, "step": 3880}, {"loss": 1.4722, "grad_norm": 1.0394847393035889, "learning_rate": 0.0002, "epoch": 4.336677814938684, "step": 3890}, {"loss": 1.4574, "grad_norm": 0.7044078707695007, "learning_rate": 0.0002, "epoch": 4.3478260869565215, "step": 3900}, {"loss": 1.3185, "grad_norm": 0.8852819204330444, "learning_rate": 0.0002, "epoch": 4.358974358974359, "step": 3910}, {"loss": 1.3664, "grad_norm": 0.7712758779525757, "learning_rate": 0.0002, "epoch": 4.3701226309921966, "step": 3920}, {"loss": 1.3519, "grad_norm": 0.7677774429321289, "learning_rate": 0.0002, "epoch": 4.381270903010034, "step": 3930}, {"loss": 1.3693, "grad_norm": 0.7450921535491943, "learning_rate": 0.0002, "epoch": 4.392419175027871, "step": 3940}, {"loss": 1.392, "grad_norm": 0.7802795767784119, "learning_rate": 0.0002, "epoch": 4.403567447045708, "step": 3950}, {"loss": 1.3661, "grad_norm": 0.8976508378982544, "learning_rate": 0.0002, "epoch": 4.414715719063545, "step": 3960}, {"loss": 1.4124, "grad_norm": 0.8148922324180603, "learning_rate": 0.0002, "epoch": 4.425863991081382, "step": 3970}, {"loss": 1.3937, "grad_norm": 0.7490504384040833, "learning_rate": 0.0002, "epoch": 4.43701226309922, "step": 3980}, {"loss": 1.393, "grad_norm": 0.753652036190033, "learning_rate": 0.0002, "epoch": 4.448160535117057, "step": 3990}, {"loss": 1.3467, "grad_norm": 0.803986668586731, "learning_rate": 0.0002, "epoch": 4.459308807134894, "step": 4000}, {"loss": 1.3872, "grad_norm": 0.8643081784248352, "learning_rate": 0.0002, "epoch": 4.470457079152731, "step": 4010}, {"loss": 1.407, "grad_norm": 0.8298280835151672, "learning_rate": 0.0002, "epoch": 4.481605351170568, "step": 4020}, {"loss": 1.4555, "grad_norm": 0.705355703830719, "learning_rate": 0.0002, "epoch": 4.492753623188406, "step": 4030}, {"loss": 1.3646, "grad_norm": 0.7845711708068848, "learning_rate": 0.0002, "epoch": 4.503901895206243, "step": 4040}, {"loss": 1.3913, "grad_norm": 0.8056256175041199, "learning_rate": 0.0002, "epoch": 4.51505016722408, "step": 4050}, {"loss": 1.3716, "grad_norm": 0.7080171704292297, "learning_rate": 0.0002, "epoch": 4.5261984392419174, "step": 4060}, {"loss": 1.335, "grad_norm": 0.778388261795044, "learning_rate": 0.0002, "epoch": 4.5373467112597545, "step": 4070}, {"loss": 1.3921, "grad_norm": 0.7337639927864075, "learning_rate": 0.0002, "epoch": 4.548494983277592, "step": 4080}, {"loss": 1.369, "grad_norm": 0.815322756767273, "learning_rate": 0.0002, "epoch": 4.55964325529543, "step": 4090}, {"loss": 1.4509, "grad_norm": 0.8817179203033447, "learning_rate": 0.0002, "epoch": 4.570791527313267, "step": 4100}, {"loss": 1.344, "grad_norm": 0.7526060342788696, "learning_rate": 0.0002, "epoch": 4.581939799331104, "step": 4110}, {"loss": 1.4027, "grad_norm": 0.920465350151062, "learning_rate": 0.0002, "epoch": 4.593088071348941, "step": 4120}, {"loss": 1.3757, "grad_norm": 0.7509559392929077, "learning_rate": 0.0002, "epoch": 4.604236343366778, "step": 4130}, {"loss": 1.4064, "grad_norm": 0.799469530582428, "learning_rate": 0.0002, "epoch": 4.615384615384615, "step": 4140}, {"loss": 1.3689, "grad_norm": 0.8099892735481262, "learning_rate": 0.0002, "epoch": 4.626532887402453, "step": 4150}, {"loss": 1.3689, "grad_norm": 0.7790375351905823, "learning_rate": 0.0002, "epoch": 4.63768115942029, "step": 4160}, {"loss": 1.4626, "grad_norm": 0.8292977809906006, "learning_rate": 0.0002, "epoch": 4.648829431438127, "step": 4170}, {"loss": 1.4505, "grad_norm": 0.8312386274337769, "learning_rate": 0.0002, "epoch": 4.659977703455964, "step": 4180}, {"loss": 1.4301, "grad_norm": 0.7348753809928894, "learning_rate": 0.0002, "epoch": 4.671125975473801, "step": 4190}, {"loss": 1.4074, "grad_norm": 0.8006551265716553, "learning_rate": 0.0002, "epoch": 4.682274247491639, "step": 4200}, {"loss": 1.4349, "grad_norm": 0.8477752804756165, "learning_rate": 0.0002, "epoch": 4.693422519509476, "step": 4210}, {"loss": 1.3943, "grad_norm": 0.7056546211242676, "learning_rate": 0.0002, "epoch": 4.704570791527313, "step": 4220}, {"loss": 1.3415, "grad_norm": 0.7858873009681702, "learning_rate": 0.0002, "epoch": 4.7157190635451505, "step": 4230}, {"loss": 1.3644, "grad_norm": 0.6968740224838257, "learning_rate": 0.0002, "epoch": 4.7268673355629875, "step": 4240}, {"loss": 1.3594, "grad_norm": 0.7886689901351929, "learning_rate": 0.0002, "epoch": 4.738015607580825, "step": 4250}, {"loss": 1.3783, "grad_norm": 0.8935304880142212, "learning_rate": 0.0002, "epoch": 4.749163879598662, "step": 4260}, {"loss": 1.3664, "grad_norm": 0.8395553231239319, "learning_rate": 0.0002, "epoch": 4.7603121516165, "step": 4270}, {"loss": 1.4113, "grad_norm": 0.817263126373291, "learning_rate": 0.0002, "epoch": 4.771460423634337, "step": 4280}, {"loss": 1.4181, "grad_norm": 0.7912008166313171, "learning_rate": 0.0002, "epoch": 4.782608695652174, "step": 4290}, {"loss": 1.4369, "grad_norm": 0.6637866497039795, "learning_rate": 0.0002, "epoch": 4.793756967670011, "step": 4300}, {"loss": 1.4328, "grad_norm": 1.0709338188171387, "learning_rate": 0.0002, "epoch": 4.804905239687848, "step": 4310}, {"loss": 1.4635, "grad_norm": 0.8179698586463928, "learning_rate": 0.0002, "epoch": 4.816053511705686, "step": 4320}, {"loss": 1.3757, "grad_norm": 0.7952052354812622, "learning_rate": 0.0002, "epoch": 4.827201783723523, "step": 4330}, {"loss": 1.3954, "grad_norm": 0.7235367894172668, "learning_rate": 0.0002, "epoch": 4.83835005574136, "step": 4340}, {"loss": 1.4668, "grad_norm": 0.8484606742858887, "learning_rate": 0.0002, "epoch": 4.849498327759197, "step": 4350}, {"loss": 1.3898, "grad_norm": 0.7344942092895508, "learning_rate": 0.0002, "epoch": 4.860646599777034, "step": 4360}, {"loss": 1.4519, "grad_norm": 0.9718546867370605, "learning_rate": 0.0002, "epoch": 4.871794871794872, "step": 4370}, {"loss": 1.4187, "grad_norm": 0.8174259066581726, "learning_rate": 0.0002, "epoch": 4.882943143812709, "step": 4380}, {"loss": 1.3244, "grad_norm": 0.8097165822982788, "learning_rate": 0.0002, "epoch": 4.894091415830546, "step": 4390}, {"loss": 1.3689, "grad_norm": 0.756388783454895, "learning_rate": 0.0002, "epoch": 4.9052396878483835, "step": 4400}, {"loss": 1.4129, "grad_norm": 0.8324617743492126, "learning_rate": 0.0002, "epoch": 4.916387959866221, "step": 4410}, {"loss": 1.3662, "grad_norm": 0.8949803709983826, "learning_rate": 0.0002, "epoch": 4.927536231884058, "step": 4420}, {"loss": 1.4632, "grad_norm": 0.7663722634315491, "learning_rate": 0.0002, "epoch": 4.938684503901895, "step": 4430}, {"loss": 1.3829, "grad_norm": 0.7727946043014526, "learning_rate": 0.0002, "epoch": 4.949832775919733, "step": 4440}, {"loss": 1.4351, "grad_norm": 0.6872350573539734, "learning_rate": 0.0002, "epoch": 4.96098104793757, "step": 4450}, {"loss": 1.4552, "grad_norm": 0.754357099533081, "learning_rate": 0.0002, "epoch": 4.972129319955407, "step": 4460}, {"loss": 1.4, "grad_norm": 0.8068729639053345, "learning_rate": 0.0002, "epoch": 4.983277591973244, "step": 4470}, {"loss": 1.3891, "grad_norm": 0.8200556635856628, "learning_rate": 0.0002, "epoch": 4.994425863991081, "step": 4480}]} +{"epoch": 6.0, "step": 5382, "epoch_duration": 992.7615306377411, "total_accumulated_duration": 5915.777344942093, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}, {"eval_loss": 1.8300215005874634, "eval_runtime": 38.0349, "eval_samples_per_second": 13.54, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 2691}, {"loss": 1.5768, "grad_norm": 0.4742245078086853, "learning_rate": 0.0002, "epoch": 3.0100334448160537, "step": 2700}, {"loss": 1.4859, "grad_norm": 0.5157448649406433, "learning_rate": 0.0002, "epoch": 3.021181716833891, "step": 2710}, {"loss": 1.4219, "grad_norm": 0.5634726285934448, "learning_rate": 0.0002, "epoch": 3.032329988851728, "step": 2720}, {"loss": 1.5452, "grad_norm": 0.4554799199104309, "learning_rate": 0.0002, "epoch": 3.0434782608695654, "step": 2730}, {"loss": 1.4784, "grad_norm": 0.6565208435058594, "learning_rate": 0.0002, "epoch": 3.0546265328874025, "step": 2740}, {"loss": 1.459, "grad_norm": 0.6174370050430298, "learning_rate": 0.0002, "epoch": 3.0657748049052396, "step": 2750}, {"loss": 1.469, "grad_norm": 0.4987483024597168, "learning_rate": 0.0002, "epoch": 3.076923076923077, "step": 2760}, {"loss": 1.5466, "grad_norm": 0.5810927152633667, "learning_rate": 0.0002, "epoch": 3.088071348940914, "step": 2770}, {"loss": 1.4936, "grad_norm": 0.5281634330749512, "learning_rate": 0.0002, "epoch": 3.0992196209587513, "step": 2780}, {"loss": 1.4751, "grad_norm": 0.5479053854942322, "learning_rate": 0.0002, "epoch": 3.1103678929765888, "step": 2790}, {"loss": 1.5601, "grad_norm": 0.6192978620529175, "learning_rate": 0.0002, "epoch": 3.121516164994426, "step": 2800}, {"loss": 1.4888, "grad_norm": 0.560117781162262, "learning_rate": 0.0002, "epoch": 3.132664437012263, "step": 2810}, {"loss": 1.5495, "grad_norm": 0.6067224740982056, "learning_rate": 0.0002, "epoch": 3.1438127090301005, "step": 2820}, {"loss": 1.5239, "grad_norm": 0.611287534236908, "learning_rate": 0.0002, "epoch": 3.1549609810479375, "step": 2830}, {"loss": 1.4577, "grad_norm": 0.6441587209701538, "learning_rate": 0.0002, "epoch": 3.1661092530657746, "step": 2840}, {"loss": 1.5322, "grad_norm": 0.5955114364624023, "learning_rate": 0.0002, "epoch": 3.177257525083612, "step": 2850}, {"loss": 1.5222, "grad_norm": 0.5554782748222351, "learning_rate": 0.0002, "epoch": 3.1884057971014492, "step": 2860}, {"loss": 1.4676, "grad_norm": 0.5411370992660522, "learning_rate": 0.0002, "epoch": 3.1995540691192863, "step": 2870}, {"loss": 1.5008, "grad_norm": 0.6152016520500183, "learning_rate": 0.0002, "epoch": 3.210702341137124, "step": 2880}, {"loss": 1.5229, "grad_norm": 0.5711581110954285, "learning_rate": 0.0002, "epoch": 3.221850613154961, "step": 2890}, {"loss": 1.5255, "grad_norm": 0.5399307012557983, "learning_rate": 0.0002, "epoch": 3.2329988851727984, "step": 2900}, {"loss": 1.4888, "grad_norm": 0.60606849193573, "learning_rate": 0.0002, "epoch": 3.2441471571906355, "step": 2910}, {"loss": 1.5056, "grad_norm": 0.5873523950576782, "learning_rate": 0.0002, "epoch": 3.2552954292084726, "step": 2920}, {"loss": 1.5208, "grad_norm": 0.6149439215660095, "learning_rate": 0.0002, "epoch": 3.26644370122631, "step": 2930}, {"loss": 1.4942, "grad_norm": 0.5940659046173096, "learning_rate": 0.0002, "epoch": 3.277591973244147, "step": 2940}, {"loss": 1.5031, "grad_norm": 0.6846756339073181, "learning_rate": 0.0002, "epoch": 3.2887402452619843, "step": 2950}, {"loss": 1.5425, "grad_norm": 0.6708254218101501, "learning_rate": 0.0002, "epoch": 3.299888517279822, "step": 2960}, {"loss": 1.5319, "grad_norm": 0.5966503620147705, "learning_rate": 0.0002, "epoch": 3.311036789297659, "step": 2970}, {"loss": 1.5173, "grad_norm": 0.6328812837600708, "learning_rate": 0.0002, "epoch": 3.322185061315496, "step": 2980}, {"loss": 1.5096, "grad_norm": 0.6082745790481567, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2990}, {"loss": 1.5122, "grad_norm": 0.6207539439201355, "learning_rate": 0.0002, "epoch": 3.3444816053511706, "step": 3000}, {"loss": 1.5053, "grad_norm": 0.5501793026924133, "learning_rate": 0.0002, "epoch": 3.3556298773690076, "step": 3010}, {"loss": 1.4428, "grad_norm": 0.571275532245636, "learning_rate": 0.0002, "epoch": 3.366778149386845, "step": 3020}, {"loss": 1.5914, "grad_norm": 0.7003518342971802, "learning_rate": 0.0002, "epoch": 3.3779264214046822, "step": 3030}, {"loss": 1.5359, "grad_norm": 0.609527587890625, "learning_rate": 0.0002, "epoch": 3.3890746934225193, "step": 3040}, {"loss": 1.5072, "grad_norm": 0.5880036354064941, "learning_rate": 0.0002, "epoch": 3.400222965440357, "step": 3050}, {"loss": 1.5451, "grad_norm": 0.5847334265708923, "learning_rate": 0.0002, "epoch": 3.411371237458194, "step": 3060}, {"loss": 1.4738, "grad_norm": 0.5373924970626831, "learning_rate": 0.0002, "epoch": 3.4225195094760315, "step": 3070}, {"loss": 1.5215, "grad_norm": 0.6074833869934082, "learning_rate": 0.0002, "epoch": 3.4336677814938685, "step": 3080}, {"loss": 1.458, "grad_norm": 0.5118414163589478, "learning_rate": 0.0002, "epoch": 3.4448160535117056, "step": 3090}, {"loss": 1.5006, "grad_norm": 0.5577956438064575, "learning_rate": 0.0002, "epoch": 3.4559643255295427, "step": 3100}, {"loss": 1.5057, "grad_norm": 0.5654811859130859, "learning_rate": 0.0002, "epoch": 3.46711259754738, "step": 3110}, {"loss": 1.523, "grad_norm": 0.6216017603874207, "learning_rate": 0.0002, "epoch": 3.4782608695652173, "step": 3120}, {"loss": 1.5292, "grad_norm": 0.5983642339706421, "learning_rate": 0.0002, "epoch": 3.489409141583055, "step": 3130}, {"loss": 1.5568, "grad_norm": 0.6635708212852478, "learning_rate": 0.0002, "epoch": 3.500557413600892, "step": 3140}, {"loss": 1.4633, "grad_norm": 0.6254258751869202, "learning_rate": 0.0002, "epoch": 3.511705685618729, "step": 3150}, {"loss": 1.4934, "grad_norm": 0.6359851360321045, "learning_rate": 0.0002, "epoch": 3.522853957636566, "step": 3160}, {"loss": 1.4693, "grad_norm": 0.5938616394996643, "learning_rate": 0.0002, "epoch": 3.5340022296544036, "step": 3170}, {"loss": 1.4393, "grad_norm": 0.6360630393028259, "learning_rate": 0.0002, "epoch": 3.5451505016722407, "step": 3180}, {"loss": 1.5535, "grad_norm": 0.6097670197486877, "learning_rate": 0.0002, "epoch": 3.556298773690078, "step": 3190}, {"loss": 1.5427, "grad_norm": 0.5984025597572327, "learning_rate": 0.0002, "epoch": 3.5674470457079153, "step": 3200}, {"loss": 1.4741, "grad_norm": 0.5463748574256897, "learning_rate": 0.0002, "epoch": 3.5785953177257523, "step": 3210}, {"loss": 1.513, "grad_norm": 1.0017699003219604, "learning_rate": 0.0002, "epoch": 3.58974358974359, "step": 3220}, {"loss": 1.5687, "grad_norm": 0.6519441604614258, "learning_rate": 0.0002, "epoch": 3.600891861761427, "step": 3230}, {"loss": 1.5168, "grad_norm": 0.6457271575927734, "learning_rate": 0.0002, "epoch": 3.6120401337792645, "step": 3240}, {"loss": 1.5511, "grad_norm": 0.5898868441581726, "learning_rate": 0.0002, "epoch": 3.6231884057971016, "step": 3250}, {"loss": 1.5833, "grad_norm": 0.6612270474433899, "learning_rate": 0.0002, "epoch": 3.6343366778149386, "step": 3260}, {"loss": 1.4537, "grad_norm": 0.5102090239524841, "learning_rate": 0.0002, "epoch": 3.6454849498327757, "step": 3270}, {"loss": 1.4676, "grad_norm": 0.5357231497764587, "learning_rate": 0.0002, "epoch": 3.6566332218506132, "step": 3280}, {"loss": 1.5417, "grad_norm": 0.6176130175590515, "learning_rate": 0.0002, "epoch": 3.6677814938684503, "step": 3290}, {"loss": 1.5057, "grad_norm": 0.6384354829788208, "learning_rate": 0.0002, "epoch": 3.678929765886288, "step": 3300}, {"loss": 1.5973, "grad_norm": 0.5493269562721252, "learning_rate": 0.0002, "epoch": 3.690078037904125, "step": 3310}, {"loss": 1.5958, "grad_norm": 0.5721797943115234, "learning_rate": 0.0002, "epoch": 3.701226309921962, "step": 3320}, {"loss": 1.5098, "grad_norm": 0.6667633056640625, "learning_rate": 0.0002, "epoch": 3.712374581939799, "step": 3330}, {"loss": 1.5372, "grad_norm": 0.5713372826576233, "learning_rate": 0.0002, "epoch": 3.7235228539576366, "step": 3340}, {"loss": 1.5959, "grad_norm": 0.5925018191337585, "learning_rate": 0.0002, "epoch": 3.7346711259754737, "step": 3350}, {"loss": 1.5045, "grad_norm": 0.5660955905914307, "learning_rate": 0.0002, "epoch": 3.745819397993311, "step": 3360}, {"loss": 1.5465, "grad_norm": 0.5470759868621826, "learning_rate": 0.0002, "epoch": 3.7569676700111483, "step": 3370}, {"loss": 1.547, "grad_norm": 0.7612935900688171, "learning_rate": 0.0002, "epoch": 3.7681159420289854, "step": 3380}, {"loss": 1.6224, "grad_norm": 0.577467679977417, "learning_rate": 0.0002, "epoch": 3.779264214046823, "step": 3390}, {"loss": 1.5653, "grad_norm": 0.6125091910362244, "learning_rate": 0.0002, "epoch": 3.79041248606466, "step": 3400}, {"loss": 1.5463, "grad_norm": 0.590386152267456, "learning_rate": 0.0002, "epoch": 3.801560758082497, "step": 3410}, {"loss": 1.5944, "grad_norm": 0.5530361533164978, "learning_rate": 0.0002, "epoch": 3.8127090301003346, "step": 3420}, {"loss": 1.4797, "grad_norm": 0.5714079737663269, "learning_rate": 0.0002, "epoch": 3.8238573021181717, "step": 3430}, {"loss": 1.5324, "grad_norm": 0.9061086773872375, "learning_rate": 0.0002, "epoch": 3.8350055741360087, "step": 3440}, {"loss": 1.4513, "grad_norm": 0.6193320751190186, "learning_rate": 0.0002, "epoch": 3.8461538461538463, "step": 3450}, {"loss": 1.5537, "grad_norm": 0.5831704139709473, "learning_rate": 0.0002, "epoch": 3.8573021181716833, "step": 3460}, {"loss": 1.5144, "grad_norm": 0.5971192717552185, "learning_rate": 0.0002, "epoch": 3.868450390189521, "step": 3470}, {"loss": 1.484, "grad_norm": 0.6110154390335083, "learning_rate": 0.0002, "epoch": 3.879598662207358, "step": 3480}, {"loss": 1.5624, "grad_norm": 0.6644453406333923, "learning_rate": 0.0002, "epoch": 3.890746934225195, "step": 3490}, {"loss": 1.5422, "grad_norm": 0.6674908399581909, "learning_rate": 0.0002, "epoch": 3.901895206243032, "step": 3500}, {"loss": 1.579, "grad_norm": 0.5516519546508789, "learning_rate": 0.0002, "epoch": 3.9130434782608696, "step": 3510}, {"loss": 1.5964, "grad_norm": 0.6704319715499878, "learning_rate": 0.0002, "epoch": 3.9241917502787067, "step": 3520}, {"loss": 1.515, "grad_norm": 0.5820314288139343, "learning_rate": 0.0002, "epoch": 3.9353400222965442, "step": 3530}, {"loss": 1.6458, "grad_norm": 0.6931548714637756, "learning_rate": 0.0002, "epoch": 3.9464882943143813, "step": 3540}, {"loss": 1.5338, "grad_norm": 0.6085171103477478, "learning_rate": 0.0002, "epoch": 3.9576365663322184, "step": 3550}, {"loss": 1.5537, "grad_norm": 0.5973535776138306, "learning_rate": 0.0002, "epoch": 3.9687848383500555, "step": 3560}, {"loss": 1.5435, "grad_norm": 0.49761658906936646, "learning_rate": 0.0002, "epoch": 3.979933110367893, "step": 3570}, {"loss": 1.488, "grad_norm": 0.6282512545585632, "learning_rate": 0.0002, "epoch": 3.99108138238573, "step": 3580}, {"eval_loss": 1.8790398836135864, "eval_runtime": 37.9725, "eval_samples_per_second": 13.562, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 3588}, {"loss": 1.5025, "grad_norm": 0.6402973532676697, "learning_rate": 0.0002, "epoch": 4.002229654403568, "step": 3590}, {"loss": 1.3695, "grad_norm": 0.7791030406951904, "learning_rate": 0.0002, "epoch": 4.013377926421405, "step": 3600}, {"loss": 1.3545, "grad_norm": 0.7136624455451965, "learning_rate": 0.0002, "epoch": 4.024526198439242, "step": 3610}, {"loss": 1.3515, "grad_norm": 0.7608486413955688, "learning_rate": 0.0002, "epoch": 4.035674470457079, "step": 3620}, {"loss": 1.3067, "grad_norm": 0.7486591935157776, "learning_rate": 0.0002, "epoch": 4.046822742474917, "step": 3630}, {"loss": 1.3474, "grad_norm": 0.7576302289962769, "learning_rate": 0.0002, "epoch": 4.057971014492754, "step": 3640}, {"loss": 1.3036, "grad_norm": 0.7358254194259644, "learning_rate": 0.0002, "epoch": 4.069119286510591, "step": 3650}, {"loss": 1.3015, "grad_norm": 0.821326494216919, "learning_rate": 0.0002, "epoch": 4.080267558528428, "step": 3660}, {"loss": 1.4186, "grad_norm": 0.7996482253074646, "learning_rate": 0.0002, "epoch": 4.091415830546265, "step": 3670}, {"loss": 1.3671, "grad_norm": 0.8527022004127502, "learning_rate": 0.0002, "epoch": 4.102564102564102, "step": 3680}, {"loss": 1.3818, "grad_norm": 0.7313576340675354, "learning_rate": 0.0002, "epoch": 4.11371237458194, "step": 3690}, {"loss": 1.3307, "grad_norm": 0.7854588627815247, "learning_rate": 0.0002, "epoch": 4.124860646599777, "step": 3700}, {"loss": 1.4174, "grad_norm": 0.6588303446769714, "learning_rate": 0.0002, "epoch": 4.136008918617614, "step": 3710}, {"loss": 1.3674, "grad_norm": 0.7986254692077637, "learning_rate": 0.0002, "epoch": 4.147157190635451, "step": 3720}, {"loss": 1.3505, "grad_norm": 0.6864156126976013, "learning_rate": 0.0002, "epoch": 4.1583054626532885, "step": 3730}, {"loss": 1.2987, "grad_norm": 0.8197885155677795, "learning_rate": 0.0002, "epoch": 4.169453734671126, "step": 3740}, {"loss": 1.3565, "grad_norm": 0.7169402837753296, "learning_rate": 0.0002, "epoch": 4.1806020066889635, "step": 3750}, {"loss": 1.4388, "grad_norm": 0.7948839068412781, "learning_rate": 0.0002, "epoch": 4.191750278706801, "step": 3760}, {"loss": 1.4648, "grad_norm": 0.6775302290916443, "learning_rate": 0.0002, "epoch": 4.202898550724638, "step": 3770}, {"loss": 1.3238, "grad_norm": 0.8913543820381165, "learning_rate": 0.0002, "epoch": 4.214046822742475, "step": 3780}, {"loss": 1.4251, "grad_norm": 0.8046368360519409, "learning_rate": 0.0002, "epoch": 4.225195094760312, "step": 3790}, {"loss": 1.3542, "grad_norm": 0.9359563589096069, "learning_rate": 0.0002, "epoch": 4.236343366778149, "step": 3800}, {"loss": 1.3963, "grad_norm": 0.8012228608131409, "learning_rate": 0.0002, "epoch": 4.247491638795987, "step": 3810}, {"loss": 1.311, "grad_norm": 0.8405851125717163, "learning_rate": 0.0002, "epoch": 4.258639910813824, "step": 3820}, {"loss": 1.3903, "grad_norm": 0.7812899351119995, "learning_rate": 0.0002, "epoch": 4.269788182831661, "step": 3830}, {"loss": 1.4006, "grad_norm": 0.8192463517189026, "learning_rate": 0.0002, "epoch": 4.280936454849498, "step": 3840}, {"loss": 1.3663, "grad_norm": 0.6937220096588135, "learning_rate": 0.0002, "epoch": 4.292084726867335, "step": 3850}, {"loss": 1.391, "grad_norm": 0.7245703935623169, "learning_rate": 0.0002, "epoch": 4.303232998885173, "step": 3860}, {"loss": 1.3351, "grad_norm": 0.7816787362098694, "learning_rate": 0.0002, "epoch": 4.31438127090301, "step": 3870}, {"loss": 1.4316, "grad_norm": 0.7904975414276123, "learning_rate": 0.0002, "epoch": 4.325529542920847, "step": 3880}, {"loss": 1.4722, "grad_norm": 1.0394847393035889, "learning_rate": 0.0002, "epoch": 4.336677814938684, "step": 3890}, {"loss": 1.4574, "grad_norm": 0.7044078707695007, "learning_rate": 0.0002, "epoch": 4.3478260869565215, "step": 3900}, {"loss": 1.3185, "grad_norm": 0.8852819204330444, "learning_rate": 0.0002, "epoch": 4.358974358974359, "step": 3910}, {"loss": 1.3664, "grad_norm": 0.7712758779525757, "learning_rate": 0.0002, "epoch": 4.3701226309921966, "step": 3920}, {"loss": 1.3519, "grad_norm": 0.7677774429321289, "learning_rate": 0.0002, "epoch": 4.381270903010034, "step": 3930}, {"loss": 1.3693, "grad_norm": 0.7450921535491943, "learning_rate": 0.0002, "epoch": 4.392419175027871, "step": 3940}, {"loss": 1.392, "grad_norm": 0.7802795767784119, "learning_rate": 0.0002, "epoch": 4.403567447045708, "step": 3950}, {"loss": 1.3661, "grad_norm": 0.8976508378982544, "learning_rate": 0.0002, "epoch": 4.414715719063545, "step": 3960}, {"loss": 1.4124, "grad_norm": 0.8148922324180603, "learning_rate": 0.0002, "epoch": 4.425863991081382, "step": 3970}, {"loss": 1.3937, "grad_norm": 0.7490504384040833, "learning_rate": 0.0002, "epoch": 4.43701226309922, "step": 3980}, {"loss": 1.393, "grad_norm": 0.753652036190033, "learning_rate": 0.0002, "epoch": 4.448160535117057, "step": 3990}, {"loss": 1.3467, "grad_norm": 0.803986668586731, "learning_rate": 0.0002, "epoch": 4.459308807134894, "step": 4000}, {"loss": 1.3872, "grad_norm": 0.8643081784248352, "learning_rate": 0.0002, "epoch": 4.470457079152731, "step": 4010}, {"loss": 1.407, "grad_norm": 0.8298280835151672, "learning_rate": 0.0002, "epoch": 4.481605351170568, "step": 4020}, {"loss": 1.4555, "grad_norm": 0.705355703830719, "learning_rate": 0.0002, "epoch": 4.492753623188406, "step": 4030}, {"loss": 1.3646, "grad_norm": 0.7845711708068848, "learning_rate": 0.0002, "epoch": 4.503901895206243, "step": 4040}, {"loss": 1.3913, "grad_norm": 0.8056256175041199, "learning_rate": 0.0002, "epoch": 4.51505016722408, "step": 4050}, {"loss": 1.3716, "grad_norm": 0.7080171704292297, "learning_rate": 0.0002, "epoch": 4.5261984392419174, "step": 4060}, {"loss": 1.335, "grad_norm": 0.778388261795044, "learning_rate": 0.0002, "epoch": 4.5373467112597545, "step": 4070}, {"loss": 1.3921, "grad_norm": 0.7337639927864075, "learning_rate": 0.0002, "epoch": 4.548494983277592, "step": 4080}, {"loss": 1.369, "grad_norm": 0.815322756767273, "learning_rate": 0.0002, "epoch": 4.55964325529543, "step": 4090}, {"loss": 1.4509, "grad_norm": 0.8817179203033447, "learning_rate": 0.0002, "epoch": 4.570791527313267, "step": 4100}, {"loss": 1.344, "grad_norm": 0.7526060342788696, "learning_rate": 0.0002, "epoch": 4.581939799331104, "step": 4110}, {"loss": 1.4027, "grad_norm": 0.920465350151062, "learning_rate": 0.0002, "epoch": 4.593088071348941, "step": 4120}, {"loss": 1.3757, "grad_norm": 0.7509559392929077, "learning_rate": 0.0002, "epoch": 4.604236343366778, "step": 4130}, {"loss": 1.4064, "grad_norm": 0.799469530582428, "learning_rate": 0.0002, "epoch": 4.615384615384615, "step": 4140}, {"loss": 1.3689, "grad_norm": 0.8099892735481262, "learning_rate": 0.0002, "epoch": 4.626532887402453, "step": 4150}, {"loss": 1.3689, "grad_norm": 0.7790375351905823, "learning_rate": 0.0002, "epoch": 4.63768115942029, "step": 4160}, {"loss": 1.4626, "grad_norm": 0.8292977809906006, "learning_rate": 0.0002, "epoch": 4.648829431438127, "step": 4170}, {"loss": 1.4505, "grad_norm": 0.8312386274337769, "learning_rate": 0.0002, "epoch": 4.659977703455964, "step": 4180}, {"loss": 1.4301, "grad_norm": 0.7348753809928894, "learning_rate": 0.0002, "epoch": 4.671125975473801, "step": 4190}, {"loss": 1.4074, "grad_norm": 0.8006551265716553, "learning_rate": 0.0002, "epoch": 4.682274247491639, "step": 4200}, {"loss": 1.4349, "grad_norm": 0.8477752804756165, "learning_rate": 0.0002, "epoch": 4.693422519509476, "step": 4210}, {"loss": 1.3943, "grad_norm": 0.7056546211242676, "learning_rate": 0.0002, "epoch": 4.704570791527313, "step": 4220}, {"loss": 1.3415, "grad_norm": 0.7858873009681702, "learning_rate": 0.0002, "epoch": 4.7157190635451505, "step": 4230}, {"loss": 1.3644, "grad_norm": 0.6968740224838257, "learning_rate": 0.0002, "epoch": 4.7268673355629875, "step": 4240}, {"loss": 1.3594, "grad_norm": 0.7886689901351929, "learning_rate": 0.0002, "epoch": 4.738015607580825, "step": 4250}, {"loss": 1.3783, "grad_norm": 0.8935304880142212, "learning_rate": 0.0002, "epoch": 4.749163879598662, "step": 4260}, {"loss": 1.3664, "grad_norm": 0.8395553231239319, "learning_rate": 0.0002, "epoch": 4.7603121516165, "step": 4270}, {"loss": 1.4113, "grad_norm": 0.817263126373291, "learning_rate": 0.0002, "epoch": 4.771460423634337, "step": 4280}, {"loss": 1.4181, "grad_norm": 0.7912008166313171, "learning_rate": 0.0002, "epoch": 4.782608695652174, "step": 4290}, {"loss": 1.4369, "grad_norm": 0.6637866497039795, "learning_rate": 0.0002, "epoch": 4.793756967670011, "step": 4300}, {"loss": 1.4328, "grad_norm": 1.0709338188171387, "learning_rate": 0.0002, "epoch": 4.804905239687848, "step": 4310}, {"loss": 1.4635, "grad_norm": 0.8179698586463928, "learning_rate": 0.0002, "epoch": 4.816053511705686, "step": 4320}, {"loss": 1.3757, "grad_norm": 0.7952052354812622, "learning_rate": 0.0002, "epoch": 4.827201783723523, "step": 4330}, {"loss": 1.3954, "grad_norm": 0.7235367894172668, "learning_rate": 0.0002, "epoch": 4.83835005574136, "step": 4340}, {"loss": 1.4668, "grad_norm": 0.8484606742858887, "learning_rate": 0.0002, "epoch": 4.849498327759197, "step": 4350}, {"loss": 1.3898, "grad_norm": 0.7344942092895508, "learning_rate": 0.0002, "epoch": 4.860646599777034, "step": 4360}, {"loss": 1.4519, "grad_norm": 0.9718546867370605, "learning_rate": 0.0002, "epoch": 4.871794871794872, "step": 4370}, {"loss": 1.4187, "grad_norm": 0.8174259066581726, "learning_rate": 0.0002, "epoch": 4.882943143812709, "step": 4380}, {"loss": 1.3244, "grad_norm": 0.8097165822982788, "learning_rate": 0.0002, "epoch": 4.894091415830546, "step": 4390}, {"loss": 1.3689, "grad_norm": 0.756388783454895, "learning_rate": 0.0002, "epoch": 4.9052396878483835, "step": 4400}, {"loss": 1.4129, "grad_norm": 0.8324617743492126, "learning_rate": 0.0002, "epoch": 4.916387959866221, "step": 4410}, {"loss": 1.3662, "grad_norm": 0.8949803709983826, "learning_rate": 0.0002, "epoch": 4.927536231884058, "step": 4420}, {"loss": 1.4632, "grad_norm": 0.7663722634315491, "learning_rate": 0.0002, "epoch": 4.938684503901895, "step": 4430}, {"loss": 1.3829, "grad_norm": 0.7727946043014526, "learning_rate": 0.0002, "epoch": 4.949832775919733, "step": 4440}, {"loss": 1.4351, "grad_norm": 0.6872350573539734, "learning_rate": 0.0002, "epoch": 4.96098104793757, "step": 4450}, {"loss": 1.4552, "grad_norm": 0.754357099533081, "learning_rate": 0.0002, "epoch": 4.972129319955407, "step": 4460}, {"loss": 1.4, "grad_norm": 0.8068729639053345, "learning_rate": 0.0002, "epoch": 4.983277591973244, "step": 4470}, {"loss": 1.3891, "grad_norm": 0.8200556635856628, "learning_rate": 0.0002, "epoch": 4.994425863991081, "step": 4480}, {"eval_loss": 1.9543706178665161, "eval_runtime": 37.9369, "eval_samples_per_second": 13.575, "eval_steps_per_second": 1.713, "epoch": 5.0, "step": 4485}, {"loss": 1.3194, "grad_norm": 0.7499465942382812, "learning_rate": 0.0002, "epoch": 5.005574136008919, "step": 4490}, {"loss": 1.2143, "grad_norm": 1.030434489250183, "learning_rate": 0.0002, "epoch": 5.016722408026756, "step": 4500}, {"loss": 1.2408, "grad_norm": 0.8914631605148315, "learning_rate": 0.0002, "epoch": 5.027870680044593, "step": 4510}, {"loss": 1.1448, "grad_norm": 0.9902928471565247, "learning_rate": 0.0002, "epoch": 5.03901895206243, "step": 4520}, {"loss": 1.2401, "grad_norm": 0.8338701128959656, "learning_rate": 0.0002, "epoch": 5.050167224080267, "step": 4530}, {"loss": 1.1952, "grad_norm": 0.9440169334411621, "learning_rate": 0.0002, "epoch": 5.061315496098104, "step": 4540}, {"loss": 1.2196, "grad_norm": 0.8755099177360535, "learning_rate": 0.0002, "epoch": 5.072463768115942, "step": 4550}, {"loss": 1.1806, "grad_norm": 0.9145820140838623, "learning_rate": 0.0002, "epoch": 5.083612040133779, "step": 4560}, {"loss": 1.147, "grad_norm": 1.0068492889404297, "learning_rate": 0.0002, "epoch": 5.0947603121516165, "step": 4570}, {"loss": 1.2192, "grad_norm": 0.9184673428535461, "learning_rate": 0.0002, "epoch": 5.105908584169454, "step": 4580}, {"loss": 1.2948, "grad_norm": 1.1158655881881714, "learning_rate": 0.0002, "epoch": 5.117056856187291, "step": 4590}, {"loss": 1.2423, "grad_norm": 0.9685078263282776, "learning_rate": 0.0002, "epoch": 5.128205128205128, "step": 4600}, {"loss": 1.2654, "grad_norm": 1.0389559268951416, "learning_rate": 0.0002, "epoch": 5.139353400222966, "step": 4610}, {"loss": 1.1965, "grad_norm": 1.0294485092163086, "learning_rate": 0.0002, "epoch": 5.150501672240803, "step": 4620}, {"loss": 1.296, "grad_norm": 0.9368783235549927, "learning_rate": 0.0002, "epoch": 5.16164994425864, "step": 4630}, {"loss": 1.206, "grad_norm": 0.9724945425987244, "learning_rate": 0.0002, "epoch": 5.172798216276477, "step": 4640}, {"loss": 1.2319, "grad_norm": 0.876488447189331, "learning_rate": 0.0002, "epoch": 5.183946488294314, "step": 4650}, {"loss": 1.2506, "grad_norm": 0.9106290340423584, "learning_rate": 0.0002, "epoch": 5.195094760312152, "step": 4660}, {"loss": 1.2896, "grad_norm": 1.0924615859985352, "learning_rate": 0.0002, "epoch": 5.206243032329989, "step": 4670}, {"loss": 1.245, "grad_norm": 1.0379078388214111, "learning_rate": 0.0002, "epoch": 5.217391304347826, "step": 4680}, {"loss": 1.2155, "grad_norm": 0.9507831931114197, "learning_rate": 0.0002, "epoch": 5.228539576365663, "step": 4690}, {"loss": 1.2318, "grad_norm": 1.0408620834350586, "learning_rate": 0.0002, "epoch": 5.2396878483835, "step": 4700}, {"loss": 1.1819, "grad_norm": 0.9463635087013245, "learning_rate": 0.0002, "epoch": 5.250836120401337, "step": 4710}, {"loss": 1.1951, "grad_norm": 0.8919326663017273, "learning_rate": 0.0002, "epoch": 5.261984392419175, "step": 4720}, {"loss": 1.228, "grad_norm": 1.0364950895309448, "learning_rate": 0.0002, "epoch": 5.2731326644370125, "step": 4730}, {"loss": 1.2543, "grad_norm": 1.0225472450256348, "learning_rate": 0.0002, "epoch": 5.2842809364548495, "step": 4740}, {"loss": 1.1995, "grad_norm": 0.816410481929779, "learning_rate": 0.0002, "epoch": 5.295429208472687, "step": 4750}, {"loss": 1.3601, "grad_norm": 1.0793992280960083, "learning_rate": 0.0002, "epoch": 5.306577480490524, "step": 4760}, {"loss": 1.2424, "grad_norm": 1.0203443765640259, "learning_rate": 0.0002, "epoch": 5.317725752508361, "step": 4770}, {"loss": 1.239, "grad_norm": 1.0731306076049805, "learning_rate": 0.0002, "epoch": 5.328874024526199, "step": 4780}, {"loss": 1.2893, "grad_norm": 0.9282820224761963, "learning_rate": 0.0002, "epoch": 5.340022296544036, "step": 4790}, {"loss": 1.2159, "grad_norm": 0.9741092920303345, "learning_rate": 0.0002, "epoch": 5.351170568561873, "step": 4800}, {"loss": 1.24, "grad_norm": 1.0683609247207642, "learning_rate": 0.0002, "epoch": 5.36231884057971, "step": 4810}, {"loss": 1.2316, "grad_norm": 0.9035003781318665, "learning_rate": 0.0002, "epoch": 5.373467112597547, "step": 4820}, {"loss": 1.2615, "grad_norm": 1.0590119361877441, "learning_rate": 0.0002, "epoch": 5.384615384615385, "step": 4830}, {"loss": 1.2089, "grad_norm": 0.9782686829566956, "learning_rate": 0.0002, "epoch": 5.395763656633222, "step": 4840}, {"loss": 1.3019, "grad_norm": 1.036087155342102, "learning_rate": 0.0002, "epoch": 5.406911928651059, "step": 4850}, {"loss": 1.2475, "grad_norm": 0.9999949932098389, "learning_rate": 0.0002, "epoch": 5.418060200668896, "step": 4860}, {"loss": 1.3014, "grad_norm": 0.9094445109367371, "learning_rate": 0.0002, "epoch": 5.429208472686733, "step": 4870}, {"loss": 1.2013, "grad_norm": 0.9079708456993103, "learning_rate": 0.0002, "epoch": 5.44035674470457, "step": 4880}, {"loss": 1.2224, "grad_norm": 1.0426156520843506, "learning_rate": 0.0002, "epoch": 5.451505016722408, "step": 4890}, {"loss": 1.2812, "grad_norm": 1.0110737085342407, "learning_rate": 0.0002, "epoch": 5.4626532887402455, "step": 4900}, {"loss": 1.2178, "grad_norm": 1.0994000434875488, "learning_rate": 0.0002, "epoch": 5.4738015607580826, "step": 4910}, {"loss": 1.2019, "grad_norm": 0.8988325595855713, "learning_rate": 0.0002, "epoch": 5.48494983277592, "step": 4920}, {"loss": 1.2694, "grad_norm": 1.0705887079238892, "learning_rate": 0.0002, "epoch": 5.496098104793757, "step": 4930}, {"loss": 1.1659, "grad_norm": 1.0268803834915161, "learning_rate": 0.0002, "epoch": 5.507246376811594, "step": 4940}, {"loss": 1.2845, "grad_norm": 1.0129153728485107, "learning_rate": 0.0002, "epoch": 5.518394648829432, "step": 4950}, {"loss": 1.2081, "grad_norm": 1.122117280960083, "learning_rate": 0.0002, "epoch": 5.529542920847269, "step": 4960}, {"loss": 1.2828, "grad_norm": 1.0318635702133179, "learning_rate": 0.0002, "epoch": 5.540691192865106, "step": 4970}, {"loss": 1.2424, "grad_norm": 0.9340117573738098, "learning_rate": 0.0002, "epoch": 5.551839464882943, "step": 4980}, {"loss": 1.1541, "grad_norm": 0.9427006244659424, "learning_rate": 0.0002, "epoch": 5.56298773690078, "step": 4990}, {"loss": 1.2911, "grad_norm": 1.1786518096923828, "learning_rate": 0.0002, "epoch": 5.574136008918618, "step": 5000}, {"loss": 1.2279, "grad_norm": 1.045157551765442, "learning_rate": 0.0002, "epoch": 5.585284280936455, "step": 5010}, {"loss": 1.2269, "grad_norm": 1.0475151538848877, "learning_rate": 0.0002, "epoch": 5.596432552954292, "step": 5020}, {"loss": 1.2718, "grad_norm": 1.040969729423523, "learning_rate": 0.0002, "epoch": 5.607580824972129, "step": 5030}, {"loss": 1.2134, "grad_norm": 0.9610048532485962, "learning_rate": 0.0002, "epoch": 5.618729096989966, "step": 5040}, {"loss": 1.1657, "grad_norm": 0.9774818420410156, "learning_rate": 0.0002, "epoch": 5.6298773690078034, "step": 5050}, {"loss": 1.2788, "grad_norm": 0.8715312480926514, "learning_rate": 0.0002, "epoch": 5.641025641025641, "step": 5060}, {"loss": 1.3077, "grad_norm": 0.9484505653381348, "learning_rate": 0.0002, "epoch": 5.6521739130434785, "step": 5070}, {"loss": 1.2787, "grad_norm": 0.8292845487594604, "learning_rate": 0.0002, "epoch": 5.663322185061316, "step": 5080}, {"loss": 1.2357, "grad_norm": 0.9876886606216431, "learning_rate": 0.0002, "epoch": 5.674470457079153, "step": 5090}, {"loss": 1.2864, "grad_norm": 0.9899171590805054, "learning_rate": 0.0002, "epoch": 5.68561872909699, "step": 5100}, {"loss": 1.2747, "grad_norm": 0.9693286418914795, "learning_rate": 0.0002, "epoch": 5.696767001114827, "step": 5110}, {"loss": 1.1952, "grad_norm": 0.958905816078186, "learning_rate": 0.0002, "epoch": 5.707915273132665, "step": 5120}, {"loss": 1.2889, "grad_norm": 0.9924837350845337, "learning_rate": 0.0002, "epoch": 5.719063545150502, "step": 5130}, {"loss": 1.3057, "grad_norm": 0.9551714062690735, "learning_rate": 0.0002, "epoch": 5.730211817168339, "step": 5140}, {"loss": 1.2643, "grad_norm": 1.0407027006149292, "learning_rate": 0.0002, "epoch": 5.741360089186176, "step": 5150}, {"loss": 1.1833, "grad_norm": 0.9688791036605835, "learning_rate": 0.0002, "epoch": 5.752508361204013, "step": 5160}, {"loss": 1.1424, "grad_norm": 1.0091899633407593, "learning_rate": 0.0002, "epoch": 5.763656633221851, "step": 5170}, {"loss": 1.2575, "grad_norm": 0.9393984079360962, "learning_rate": 0.0002, "epoch": 5.774804905239688, "step": 5180}, {"loss": 1.2177, "grad_norm": 1.1439075469970703, "learning_rate": 0.0002, "epoch": 5.785953177257525, "step": 5190}, {"loss": 1.3355, "grad_norm": 1.0178622007369995, "learning_rate": 0.0002, "epoch": 5.797101449275362, "step": 5200}, {"loss": 1.3317, "grad_norm": 0.8440285921096802, "learning_rate": 0.0002, "epoch": 5.808249721293199, "step": 5210}, {"loss": 1.3097, "grad_norm": 0.856838583946228, "learning_rate": 0.0002, "epoch": 5.8193979933110365, "step": 5220}, {"loss": 1.3109, "grad_norm": 0.8676707148551941, "learning_rate": 0.0002, "epoch": 5.8305462653288735, "step": 5230}, {"loss": 1.248, "grad_norm": 1.1034743785858154, "learning_rate": 0.0002, "epoch": 5.8416945373467115, "step": 5240}, {"loss": 1.2473, "grad_norm": 0.9631003737449646, "learning_rate": 0.0002, "epoch": 5.852842809364549, "step": 5250}, {"loss": 1.2693, "grad_norm": 1.0478793382644653, "learning_rate": 0.0002, "epoch": 5.863991081382386, "step": 5260}, {"loss": 1.2349, "grad_norm": 0.9819806218147278, "learning_rate": 0.0002, "epoch": 5.875139353400223, "step": 5270}, {"loss": 1.2817, "grad_norm": 0.8572421073913574, "learning_rate": 0.0002, "epoch": 5.88628762541806, "step": 5280}, {"loss": 1.246, "grad_norm": 0.9328814148902893, "learning_rate": 0.0002, "epoch": 5.897435897435898, "step": 5290}, {"loss": 1.3016, "grad_norm": 1.000305414199829, "learning_rate": 0.0002, "epoch": 5.908584169453735, "step": 5300}, {"loss": 1.3681, "grad_norm": 1.1006377935409546, "learning_rate": 0.0002, "epoch": 5.919732441471572, "step": 5310}, {"loss": 1.3317, "grad_norm": 0.963198721408844, "learning_rate": 0.0002, "epoch": 5.930880713489409, "step": 5320}, {"loss": 1.2713, "grad_norm": 0.8952236175537109, "learning_rate": 0.0002, "epoch": 5.942028985507246, "step": 5330}, {"loss": 1.2536, "grad_norm": 1.0945496559143066, "learning_rate": 0.0002, "epoch": 5.953177257525084, "step": 5340}, {"loss": 1.2768, "grad_norm": 1.0053467750549316, "learning_rate": 0.0002, "epoch": 5.964325529542921, "step": 5350}, {"loss": 1.3075, "grad_norm": 1.032088279724121, "learning_rate": 0.0002, "epoch": 5.975473801560758, "step": 5360}, {"loss": 1.3278, "grad_norm": 1.1068958044052124, "learning_rate": 0.0002, "epoch": 5.986622073578595, "step": 5370}, {"loss": 1.2468, "grad_norm": 1.0064235925674438, "learning_rate": 0.0002, "epoch": 5.997770345596432, "step": 5380}]} +{"epoch": 7.0, "step": 6279, "epoch_duration": 1000.6824202537537, "total_accumulated_duration": 6916.459765195847, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}, {"eval_loss": 1.8300215005874634, "eval_runtime": 38.0349, "eval_samples_per_second": 13.54, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 2691}, {"loss": 1.5768, "grad_norm": 0.4742245078086853, "learning_rate": 0.0002, "epoch": 3.0100334448160537, "step": 2700}, {"loss": 1.4859, "grad_norm": 0.5157448649406433, "learning_rate": 0.0002, "epoch": 3.021181716833891, "step": 2710}, {"loss": 1.4219, "grad_norm": 0.5634726285934448, "learning_rate": 0.0002, "epoch": 3.032329988851728, "step": 2720}, {"loss": 1.5452, "grad_norm": 0.4554799199104309, "learning_rate": 0.0002, "epoch": 3.0434782608695654, "step": 2730}, {"loss": 1.4784, "grad_norm": 0.6565208435058594, "learning_rate": 0.0002, "epoch": 3.0546265328874025, "step": 2740}, {"loss": 1.459, "grad_norm": 0.6174370050430298, "learning_rate": 0.0002, "epoch": 3.0657748049052396, "step": 2750}, {"loss": 1.469, "grad_norm": 0.4987483024597168, "learning_rate": 0.0002, "epoch": 3.076923076923077, "step": 2760}, {"loss": 1.5466, "grad_norm": 0.5810927152633667, "learning_rate": 0.0002, "epoch": 3.088071348940914, "step": 2770}, {"loss": 1.4936, "grad_norm": 0.5281634330749512, "learning_rate": 0.0002, "epoch": 3.0992196209587513, "step": 2780}, {"loss": 1.4751, "grad_norm": 0.5479053854942322, "learning_rate": 0.0002, "epoch": 3.1103678929765888, "step": 2790}, {"loss": 1.5601, "grad_norm": 0.6192978620529175, "learning_rate": 0.0002, "epoch": 3.121516164994426, "step": 2800}, {"loss": 1.4888, "grad_norm": 0.560117781162262, "learning_rate": 0.0002, "epoch": 3.132664437012263, "step": 2810}, {"loss": 1.5495, "grad_norm": 0.6067224740982056, "learning_rate": 0.0002, "epoch": 3.1438127090301005, "step": 2820}, {"loss": 1.5239, "grad_norm": 0.611287534236908, "learning_rate": 0.0002, "epoch": 3.1549609810479375, "step": 2830}, {"loss": 1.4577, "grad_norm": 0.6441587209701538, "learning_rate": 0.0002, "epoch": 3.1661092530657746, "step": 2840}, {"loss": 1.5322, "grad_norm": 0.5955114364624023, "learning_rate": 0.0002, "epoch": 3.177257525083612, "step": 2850}, {"loss": 1.5222, "grad_norm": 0.5554782748222351, "learning_rate": 0.0002, "epoch": 3.1884057971014492, "step": 2860}, {"loss": 1.4676, "grad_norm": 0.5411370992660522, "learning_rate": 0.0002, "epoch": 3.1995540691192863, "step": 2870}, {"loss": 1.5008, "grad_norm": 0.6152016520500183, "learning_rate": 0.0002, "epoch": 3.210702341137124, "step": 2880}, {"loss": 1.5229, "grad_norm": 0.5711581110954285, "learning_rate": 0.0002, "epoch": 3.221850613154961, "step": 2890}, {"loss": 1.5255, "grad_norm": 0.5399307012557983, "learning_rate": 0.0002, "epoch": 3.2329988851727984, "step": 2900}, {"loss": 1.4888, "grad_norm": 0.60606849193573, "learning_rate": 0.0002, "epoch": 3.2441471571906355, "step": 2910}, {"loss": 1.5056, "grad_norm": 0.5873523950576782, "learning_rate": 0.0002, "epoch": 3.2552954292084726, "step": 2920}, {"loss": 1.5208, "grad_norm": 0.6149439215660095, "learning_rate": 0.0002, "epoch": 3.26644370122631, "step": 2930}, {"loss": 1.4942, "grad_norm": 0.5940659046173096, "learning_rate": 0.0002, "epoch": 3.277591973244147, "step": 2940}, {"loss": 1.5031, "grad_norm": 0.6846756339073181, "learning_rate": 0.0002, "epoch": 3.2887402452619843, "step": 2950}, {"loss": 1.5425, "grad_norm": 0.6708254218101501, "learning_rate": 0.0002, "epoch": 3.299888517279822, "step": 2960}, {"loss": 1.5319, "grad_norm": 0.5966503620147705, "learning_rate": 0.0002, "epoch": 3.311036789297659, "step": 2970}, {"loss": 1.5173, "grad_norm": 0.6328812837600708, "learning_rate": 0.0002, "epoch": 3.322185061315496, "step": 2980}, {"loss": 1.5096, "grad_norm": 0.6082745790481567, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2990}, {"loss": 1.5122, "grad_norm": 0.6207539439201355, "learning_rate": 0.0002, "epoch": 3.3444816053511706, "step": 3000}, {"loss": 1.5053, "grad_norm": 0.5501793026924133, "learning_rate": 0.0002, "epoch": 3.3556298773690076, "step": 3010}, {"loss": 1.4428, "grad_norm": 0.571275532245636, "learning_rate": 0.0002, "epoch": 3.366778149386845, "step": 3020}, {"loss": 1.5914, "grad_norm": 0.7003518342971802, "learning_rate": 0.0002, "epoch": 3.3779264214046822, "step": 3030}, {"loss": 1.5359, "grad_norm": 0.609527587890625, "learning_rate": 0.0002, "epoch": 3.3890746934225193, "step": 3040}, {"loss": 1.5072, "grad_norm": 0.5880036354064941, "learning_rate": 0.0002, "epoch": 3.400222965440357, "step": 3050}, {"loss": 1.5451, "grad_norm": 0.5847334265708923, "learning_rate": 0.0002, "epoch": 3.411371237458194, "step": 3060}, {"loss": 1.4738, "grad_norm": 0.5373924970626831, "learning_rate": 0.0002, "epoch": 3.4225195094760315, "step": 3070}, {"loss": 1.5215, "grad_norm": 0.6074833869934082, "learning_rate": 0.0002, "epoch": 3.4336677814938685, "step": 3080}, {"loss": 1.458, "grad_norm": 0.5118414163589478, "learning_rate": 0.0002, "epoch": 3.4448160535117056, "step": 3090}, {"loss": 1.5006, "grad_norm": 0.5577956438064575, "learning_rate": 0.0002, "epoch": 3.4559643255295427, "step": 3100}, {"loss": 1.5057, "grad_norm": 0.5654811859130859, "learning_rate": 0.0002, "epoch": 3.46711259754738, "step": 3110}, {"loss": 1.523, "grad_norm": 0.6216017603874207, "learning_rate": 0.0002, "epoch": 3.4782608695652173, "step": 3120}, {"loss": 1.5292, "grad_norm": 0.5983642339706421, "learning_rate": 0.0002, "epoch": 3.489409141583055, "step": 3130}, {"loss": 1.5568, "grad_norm": 0.6635708212852478, "learning_rate": 0.0002, "epoch": 3.500557413600892, "step": 3140}, {"loss": 1.4633, "grad_norm": 0.6254258751869202, "learning_rate": 0.0002, "epoch": 3.511705685618729, "step": 3150}, {"loss": 1.4934, "grad_norm": 0.6359851360321045, "learning_rate": 0.0002, "epoch": 3.522853957636566, "step": 3160}, {"loss": 1.4693, "grad_norm": 0.5938616394996643, "learning_rate": 0.0002, "epoch": 3.5340022296544036, "step": 3170}, {"loss": 1.4393, "grad_norm": 0.6360630393028259, "learning_rate": 0.0002, "epoch": 3.5451505016722407, "step": 3180}, {"loss": 1.5535, "grad_norm": 0.6097670197486877, "learning_rate": 0.0002, "epoch": 3.556298773690078, "step": 3190}, {"loss": 1.5427, "grad_norm": 0.5984025597572327, "learning_rate": 0.0002, "epoch": 3.5674470457079153, "step": 3200}, {"loss": 1.4741, "grad_norm": 0.5463748574256897, "learning_rate": 0.0002, "epoch": 3.5785953177257523, "step": 3210}, {"loss": 1.513, "grad_norm": 1.0017699003219604, "learning_rate": 0.0002, "epoch": 3.58974358974359, "step": 3220}, {"loss": 1.5687, "grad_norm": 0.6519441604614258, "learning_rate": 0.0002, "epoch": 3.600891861761427, "step": 3230}, {"loss": 1.5168, "grad_norm": 0.6457271575927734, "learning_rate": 0.0002, "epoch": 3.6120401337792645, "step": 3240}, {"loss": 1.5511, "grad_norm": 0.5898868441581726, "learning_rate": 0.0002, "epoch": 3.6231884057971016, "step": 3250}, {"loss": 1.5833, "grad_norm": 0.6612270474433899, "learning_rate": 0.0002, "epoch": 3.6343366778149386, "step": 3260}, {"loss": 1.4537, "grad_norm": 0.5102090239524841, "learning_rate": 0.0002, "epoch": 3.6454849498327757, "step": 3270}, {"loss": 1.4676, "grad_norm": 0.5357231497764587, "learning_rate": 0.0002, "epoch": 3.6566332218506132, "step": 3280}, {"loss": 1.5417, "grad_norm": 0.6176130175590515, "learning_rate": 0.0002, "epoch": 3.6677814938684503, "step": 3290}, {"loss": 1.5057, "grad_norm": 0.6384354829788208, "learning_rate": 0.0002, "epoch": 3.678929765886288, "step": 3300}, {"loss": 1.5973, "grad_norm": 0.5493269562721252, "learning_rate": 0.0002, "epoch": 3.690078037904125, "step": 3310}, {"loss": 1.5958, "grad_norm": 0.5721797943115234, "learning_rate": 0.0002, "epoch": 3.701226309921962, "step": 3320}, {"loss": 1.5098, "grad_norm": 0.6667633056640625, "learning_rate": 0.0002, "epoch": 3.712374581939799, "step": 3330}, {"loss": 1.5372, "grad_norm": 0.5713372826576233, "learning_rate": 0.0002, "epoch": 3.7235228539576366, "step": 3340}, {"loss": 1.5959, "grad_norm": 0.5925018191337585, "learning_rate": 0.0002, "epoch": 3.7346711259754737, "step": 3350}, {"loss": 1.5045, "grad_norm": 0.5660955905914307, "learning_rate": 0.0002, "epoch": 3.745819397993311, "step": 3360}, {"loss": 1.5465, "grad_norm": 0.5470759868621826, "learning_rate": 0.0002, "epoch": 3.7569676700111483, "step": 3370}, {"loss": 1.547, "grad_norm": 0.7612935900688171, "learning_rate": 0.0002, "epoch": 3.7681159420289854, "step": 3380}, {"loss": 1.6224, "grad_norm": 0.577467679977417, "learning_rate": 0.0002, "epoch": 3.779264214046823, "step": 3390}, {"loss": 1.5653, "grad_norm": 0.6125091910362244, "learning_rate": 0.0002, "epoch": 3.79041248606466, "step": 3400}, {"loss": 1.5463, "grad_norm": 0.590386152267456, "learning_rate": 0.0002, "epoch": 3.801560758082497, "step": 3410}, {"loss": 1.5944, "grad_norm": 0.5530361533164978, "learning_rate": 0.0002, "epoch": 3.8127090301003346, "step": 3420}, {"loss": 1.4797, "grad_norm": 0.5714079737663269, "learning_rate": 0.0002, "epoch": 3.8238573021181717, "step": 3430}, {"loss": 1.5324, "grad_norm": 0.9061086773872375, "learning_rate": 0.0002, "epoch": 3.8350055741360087, "step": 3440}, {"loss": 1.4513, "grad_norm": 0.6193320751190186, "learning_rate": 0.0002, "epoch": 3.8461538461538463, "step": 3450}, {"loss": 1.5537, "grad_norm": 0.5831704139709473, "learning_rate": 0.0002, "epoch": 3.8573021181716833, "step": 3460}, {"loss": 1.5144, "grad_norm": 0.5971192717552185, "learning_rate": 0.0002, "epoch": 3.868450390189521, "step": 3470}, {"loss": 1.484, "grad_norm": 0.6110154390335083, "learning_rate": 0.0002, "epoch": 3.879598662207358, "step": 3480}, {"loss": 1.5624, "grad_norm": 0.6644453406333923, "learning_rate": 0.0002, "epoch": 3.890746934225195, "step": 3490}, {"loss": 1.5422, "grad_norm": 0.6674908399581909, "learning_rate": 0.0002, "epoch": 3.901895206243032, "step": 3500}, {"loss": 1.579, "grad_norm": 0.5516519546508789, "learning_rate": 0.0002, "epoch": 3.9130434782608696, "step": 3510}, {"loss": 1.5964, "grad_norm": 0.6704319715499878, "learning_rate": 0.0002, "epoch": 3.9241917502787067, "step": 3520}, {"loss": 1.515, "grad_norm": 0.5820314288139343, "learning_rate": 0.0002, "epoch": 3.9353400222965442, "step": 3530}, {"loss": 1.6458, "grad_norm": 0.6931548714637756, "learning_rate": 0.0002, "epoch": 3.9464882943143813, "step": 3540}, {"loss": 1.5338, "grad_norm": 0.6085171103477478, "learning_rate": 0.0002, "epoch": 3.9576365663322184, "step": 3550}, {"loss": 1.5537, "grad_norm": 0.5973535776138306, "learning_rate": 0.0002, "epoch": 3.9687848383500555, "step": 3560}, {"loss": 1.5435, "grad_norm": 0.49761658906936646, "learning_rate": 0.0002, "epoch": 3.979933110367893, "step": 3570}, {"loss": 1.488, "grad_norm": 0.6282512545585632, "learning_rate": 0.0002, "epoch": 3.99108138238573, "step": 3580}, {"eval_loss": 1.8790398836135864, "eval_runtime": 37.9725, "eval_samples_per_second": 13.562, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 3588}, {"loss": 1.5025, "grad_norm": 0.6402973532676697, "learning_rate": 0.0002, "epoch": 4.002229654403568, "step": 3590}, {"loss": 1.3695, "grad_norm": 0.7791030406951904, "learning_rate": 0.0002, "epoch": 4.013377926421405, "step": 3600}, {"loss": 1.3545, "grad_norm": 0.7136624455451965, "learning_rate": 0.0002, "epoch": 4.024526198439242, "step": 3610}, {"loss": 1.3515, "grad_norm": 0.7608486413955688, "learning_rate": 0.0002, "epoch": 4.035674470457079, "step": 3620}, {"loss": 1.3067, "grad_norm": 0.7486591935157776, "learning_rate": 0.0002, "epoch": 4.046822742474917, "step": 3630}, {"loss": 1.3474, "grad_norm": 0.7576302289962769, "learning_rate": 0.0002, "epoch": 4.057971014492754, "step": 3640}, {"loss": 1.3036, "grad_norm": 0.7358254194259644, "learning_rate": 0.0002, "epoch": 4.069119286510591, "step": 3650}, {"loss": 1.3015, "grad_norm": 0.821326494216919, "learning_rate": 0.0002, "epoch": 4.080267558528428, "step": 3660}, {"loss": 1.4186, "grad_norm": 0.7996482253074646, "learning_rate": 0.0002, "epoch": 4.091415830546265, "step": 3670}, {"loss": 1.3671, "grad_norm": 0.8527022004127502, "learning_rate": 0.0002, "epoch": 4.102564102564102, "step": 3680}, {"loss": 1.3818, "grad_norm": 0.7313576340675354, "learning_rate": 0.0002, "epoch": 4.11371237458194, "step": 3690}, {"loss": 1.3307, "grad_norm": 0.7854588627815247, "learning_rate": 0.0002, "epoch": 4.124860646599777, "step": 3700}, {"loss": 1.4174, "grad_norm": 0.6588303446769714, "learning_rate": 0.0002, "epoch": 4.136008918617614, "step": 3710}, {"loss": 1.3674, "grad_norm": 0.7986254692077637, "learning_rate": 0.0002, "epoch": 4.147157190635451, "step": 3720}, {"loss": 1.3505, "grad_norm": 0.6864156126976013, "learning_rate": 0.0002, "epoch": 4.1583054626532885, "step": 3730}, {"loss": 1.2987, "grad_norm": 0.8197885155677795, "learning_rate": 0.0002, "epoch": 4.169453734671126, "step": 3740}, {"loss": 1.3565, "grad_norm": 0.7169402837753296, "learning_rate": 0.0002, "epoch": 4.1806020066889635, "step": 3750}, {"loss": 1.4388, "grad_norm": 0.7948839068412781, "learning_rate": 0.0002, "epoch": 4.191750278706801, "step": 3760}, {"loss": 1.4648, "grad_norm": 0.6775302290916443, "learning_rate": 0.0002, "epoch": 4.202898550724638, "step": 3770}, {"loss": 1.3238, "grad_norm": 0.8913543820381165, "learning_rate": 0.0002, "epoch": 4.214046822742475, "step": 3780}, {"loss": 1.4251, "grad_norm": 0.8046368360519409, "learning_rate": 0.0002, "epoch": 4.225195094760312, "step": 3790}, {"loss": 1.3542, "grad_norm": 0.9359563589096069, "learning_rate": 0.0002, "epoch": 4.236343366778149, "step": 3800}, {"loss": 1.3963, "grad_norm": 0.8012228608131409, "learning_rate": 0.0002, "epoch": 4.247491638795987, "step": 3810}, {"loss": 1.311, "grad_norm": 0.8405851125717163, "learning_rate": 0.0002, "epoch": 4.258639910813824, "step": 3820}, {"loss": 1.3903, "grad_norm": 0.7812899351119995, "learning_rate": 0.0002, "epoch": 4.269788182831661, "step": 3830}, {"loss": 1.4006, "grad_norm": 0.8192463517189026, "learning_rate": 0.0002, "epoch": 4.280936454849498, "step": 3840}, {"loss": 1.3663, "grad_norm": 0.6937220096588135, "learning_rate": 0.0002, "epoch": 4.292084726867335, "step": 3850}, {"loss": 1.391, "grad_norm": 0.7245703935623169, "learning_rate": 0.0002, "epoch": 4.303232998885173, "step": 3860}, {"loss": 1.3351, "grad_norm": 0.7816787362098694, "learning_rate": 0.0002, "epoch": 4.31438127090301, "step": 3870}, {"loss": 1.4316, "grad_norm": 0.7904975414276123, "learning_rate": 0.0002, "epoch": 4.325529542920847, "step": 3880}, {"loss": 1.4722, "grad_norm": 1.0394847393035889, "learning_rate": 0.0002, "epoch": 4.336677814938684, "step": 3890}, {"loss": 1.4574, "grad_norm": 0.7044078707695007, "learning_rate": 0.0002, "epoch": 4.3478260869565215, "step": 3900}, {"loss": 1.3185, "grad_norm": 0.8852819204330444, "learning_rate": 0.0002, "epoch": 4.358974358974359, "step": 3910}, {"loss": 1.3664, "grad_norm": 0.7712758779525757, "learning_rate": 0.0002, "epoch": 4.3701226309921966, "step": 3920}, {"loss": 1.3519, "grad_norm": 0.7677774429321289, "learning_rate": 0.0002, "epoch": 4.381270903010034, "step": 3930}, {"loss": 1.3693, "grad_norm": 0.7450921535491943, "learning_rate": 0.0002, "epoch": 4.392419175027871, "step": 3940}, {"loss": 1.392, "grad_norm": 0.7802795767784119, "learning_rate": 0.0002, "epoch": 4.403567447045708, "step": 3950}, {"loss": 1.3661, "grad_norm": 0.8976508378982544, "learning_rate": 0.0002, "epoch": 4.414715719063545, "step": 3960}, {"loss": 1.4124, "grad_norm": 0.8148922324180603, "learning_rate": 0.0002, "epoch": 4.425863991081382, "step": 3970}, {"loss": 1.3937, "grad_norm": 0.7490504384040833, "learning_rate": 0.0002, "epoch": 4.43701226309922, "step": 3980}, {"loss": 1.393, "grad_norm": 0.753652036190033, "learning_rate": 0.0002, "epoch": 4.448160535117057, "step": 3990}, {"loss": 1.3467, "grad_norm": 0.803986668586731, "learning_rate": 0.0002, "epoch": 4.459308807134894, "step": 4000}, {"loss": 1.3872, "grad_norm": 0.8643081784248352, "learning_rate": 0.0002, "epoch": 4.470457079152731, "step": 4010}, {"loss": 1.407, "grad_norm": 0.8298280835151672, "learning_rate": 0.0002, "epoch": 4.481605351170568, "step": 4020}, {"loss": 1.4555, "grad_norm": 0.705355703830719, "learning_rate": 0.0002, "epoch": 4.492753623188406, "step": 4030}, {"loss": 1.3646, "grad_norm": 0.7845711708068848, "learning_rate": 0.0002, "epoch": 4.503901895206243, "step": 4040}, {"loss": 1.3913, "grad_norm": 0.8056256175041199, "learning_rate": 0.0002, "epoch": 4.51505016722408, "step": 4050}, {"loss": 1.3716, "grad_norm": 0.7080171704292297, "learning_rate": 0.0002, "epoch": 4.5261984392419174, "step": 4060}, {"loss": 1.335, "grad_norm": 0.778388261795044, "learning_rate": 0.0002, "epoch": 4.5373467112597545, "step": 4070}, {"loss": 1.3921, "grad_norm": 0.7337639927864075, "learning_rate": 0.0002, "epoch": 4.548494983277592, "step": 4080}, {"loss": 1.369, "grad_norm": 0.815322756767273, "learning_rate": 0.0002, "epoch": 4.55964325529543, "step": 4090}, {"loss": 1.4509, "grad_norm": 0.8817179203033447, "learning_rate": 0.0002, "epoch": 4.570791527313267, "step": 4100}, {"loss": 1.344, "grad_norm": 0.7526060342788696, "learning_rate": 0.0002, "epoch": 4.581939799331104, "step": 4110}, {"loss": 1.4027, "grad_norm": 0.920465350151062, "learning_rate": 0.0002, "epoch": 4.593088071348941, "step": 4120}, {"loss": 1.3757, "grad_norm": 0.7509559392929077, "learning_rate": 0.0002, "epoch": 4.604236343366778, "step": 4130}, {"loss": 1.4064, "grad_norm": 0.799469530582428, "learning_rate": 0.0002, "epoch": 4.615384615384615, "step": 4140}, {"loss": 1.3689, "grad_norm": 0.8099892735481262, "learning_rate": 0.0002, "epoch": 4.626532887402453, "step": 4150}, {"loss": 1.3689, "grad_norm": 0.7790375351905823, "learning_rate": 0.0002, "epoch": 4.63768115942029, "step": 4160}, {"loss": 1.4626, "grad_norm": 0.8292977809906006, "learning_rate": 0.0002, "epoch": 4.648829431438127, "step": 4170}, {"loss": 1.4505, "grad_norm": 0.8312386274337769, "learning_rate": 0.0002, "epoch": 4.659977703455964, "step": 4180}, {"loss": 1.4301, "grad_norm": 0.7348753809928894, "learning_rate": 0.0002, "epoch": 4.671125975473801, "step": 4190}, {"loss": 1.4074, "grad_norm": 0.8006551265716553, "learning_rate": 0.0002, "epoch": 4.682274247491639, "step": 4200}, {"loss": 1.4349, "grad_norm": 0.8477752804756165, "learning_rate": 0.0002, "epoch": 4.693422519509476, "step": 4210}, {"loss": 1.3943, "grad_norm": 0.7056546211242676, "learning_rate": 0.0002, "epoch": 4.704570791527313, "step": 4220}, {"loss": 1.3415, "grad_norm": 0.7858873009681702, "learning_rate": 0.0002, "epoch": 4.7157190635451505, "step": 4230}, {"loss": 1.3644, "grad_norm": 0.6968740224838257, "learning_rate": 0.0002, "epoch": 4.7268673355629875, "step": 4240}, {"loss": 1.3594, "grad_norm": 0.7886689901351929, "learning_rate": 0.0002, "epoch": 4.738015607580825, "step": 4250}, {"loss": 1.3783, "grad_norm": 0.8935304880142212, "learning_rate": 0.0002, "epoch": 4.749163879598662, "step": 4260}, {"loss": 1.3664, "grad_norm": 0.8395553231239319, "learning_rate": 0.0002, "epoch": 4.7603121516165, "step": 4270}, {"loss": 1.4113, "grad_norm": 0.817263126373291, "learning_rate": 0.0002, "epoch": 4.771460423634337, "step": 4280}, {"loss": 1.4181, "grad_norm": 0.7912008166313171, "learning_rate": 0.0002, "epoch": 4.782608695652174, "step": 4290}, {"loss": 1.4369, "grad_norm": 0.6637866497039795, "learning_rate": 0.0002, "epoch": 4.793756967670011, "step": 4300}, {"loss": 1.4328, "grad_norm": 1.0709338188171387, "learning_rate": 0.0002, "epoch": 4.804905239687848, "step": 4310}, {"loss": 1.4635, "grad_norm": 0.8179698586463928, "learning_rate": 0.0002, "epoch": 4.816053511705686, "step": 4320}, {"loss": 1.3757, "grad_norm": 0.7952052354812622, "learning_rate": 0.0002, "epoch": 4.827201783723523, "step": 4330}, {"loss": 1.3954, "grad_norm": 0.7235367894172668, "learning_rate": 0.0002, "epoch": 4.83835005574136, "step": 4340}, {"loss": 1.4668, "grad_norm": 0.8484606742858887, "learning_rate": 0.0002, "epoch": 4.849498327759197, "step": 4350}, {"loss": 1.3898, "grad_norm": 0.7344942092895508, "learning_rate": 0.0002, "epoch": 4.860646599777034, "step": 4360}, {"loss": 1.4519, "grad_norm": 0.9718546867370605, "learning_rate": 0.0002, "epoch": 4.871794871794872, "step": 4370}, {"loss": 1.4187, "grad_norm": 0.8174259066581726, "learning_rate": 0.0002, "epoch": 4.882943143812709, "step": 4380}, {"loss": 1.3244, "grad_norm": 0.8097165822982788, "learning_rate": 0.0002, "epoch": 4.894091415830546, "step": 4390}, {"loss": 1.3689, "grad_norm": 0.756388783454895, "learning_rate": 0.0002, "epoch": 4.9052396878483835, "step": 4400}, {"loss": 1.4129, "grad_norm": 0.8324617743492126, "learning_rate": 0.0002, "epoch": 4.916387959866221, "step": 4410}, {"loss": 1.3662, "grad_norm": 0.8949803709983826, "learning_rate": 0.0002, "epoch": 4.927536231884058, "step": 4420}, {"loss": 1.4632, "grad_norm": 0.7663722634315491, "learning_rate": 0.0002, "epoch": 4.938684503901895, "step": 4430}, {"loss": 1.3829, "grad_norm": 0.7727946043014526, "learning_rate": 0.0002, "epoch": 4.949832775919733, "step": 4440}, {"loss": 1.4351, "grad_norm": 0.6872350573539734, "learning_rate": 0.0002, "epoch": 4.96098104793757, "step": 4450}, {"loss": 1.4552, "grad_norm": 0.754357099533081, "learning_rate": 0.0002, "epoch": 4.972129319955407, "step": 4460}, {"loss": 1.4, "grad_norm": 0.8068729639053345, "learning_rate": 0.0002, "epoch": 4.983277591973244, "step": 4470}, {"loss": 1.3891, "grad_norm": 0.8200556635856628, "learning_rate": 0.0002, "epoch": 4.994425863991081, "step": 4480}, {"eval_loss": 1.9543706178665161, "eval_runtime": 37.9369, "eval_samples_per_second": 13.575, "eval_steps_per_second": 1.713, "epoch": 5.0, "step": 4485}, {"loss": 1.3194, "grad_norm": 0.7499465942382812, "learning_rate": 0.0002, "epoch": 5.005574136008919, "step": 4490}, {"loss": 1.2143, "grad_norm": 1.030434489250183, "learning_rate": 0.0002, "epoch": 5.016722408026756, "step": 4500}, {"loss": 1.2408, "grad_norm": 0.8914631605148315, "learning_rate": 0.0002, "epoch": 5.027870680044593, "step": 4510}, {"loss": 1.1448, "grad_norm": 0.9902928471565247, "learning_rate": 0.0002, "epoch": 5.03901895206243, "step": 4520}, {"loss": 1.2401, "grad_norm": 0.8338701128959656, "learning_rate": 0.0002, "epoch": 5.050167224080267, "step": 4530}, {"loss": 1.1952, "grad_norm": 0.9440169334411621, "learning_rate": 0.0002, "epoch": 5.061315496098104, "step": 4540}, {"loss": 1.2196, "grad_norm": 0.8755099177360535, "learning_rate": 0.0002, "epoch": 5.072463768115942, "step": 4550}, {"loss": 1.1806, "grad_norm": 0.9145820140838623, "learning_rate": 0.0002, "epoch": 5.083612040133779, "step": 4560}, {"loss": 1.147, "grad_norm": 1.0068492889404297, "learning_rate": 0.0002, "epoch": 5.0947603121516165, "step": 4570}, {"loss": 1.2192, "grad_norm": 0.9184673428535461, "learning_rate": 0.0002, "epoch": 5.105908584169454, "step": 4580}, {"loss": 1.2948, "grad_norm": 1.1158655881881714, "learning_rate": 0.0002, "epoch": 5.117056856187291, "step": 4590}, {"loss": 1.2423, "grad_norm": 0.9685078263282776, "learning_rate": 0.0002, "epoch": 5.128205128205128, "step": 4600}, {"loss": 1.2654, "grad_norm": 1.0389559268951416, "learning_rate": 0.0002, "epoch": 5.139353400222966, "step": 4610}, {"loss": 1.1965, "grad_norm": 1.0294485092163086, "learning_rate": 0.0002, "epoch": 5.150501672240803, "step": 4620}, {"loss": 1.296, "grad_norm": 0.9368783235549927, "learning_rate": 0.0002, "epoch": 5.16164994425864, "step": 4630}, {"loss": 1.206, "grad_norm": 0.9724945425987244, "learning_rate": 0.0002, "epoch": 5.172798216276477, "step": 4640}, {"loss": 1.2319, "grad_norm": 0.876488447189331, "learning_rate": 0.0002, "epoch": 5.183946488294314, "step": 4650}, {"loss": 1.2506, "grad_norm": 0.9106290340423584, "learning_rate": 0.0002, "epoch": 5.195094760312152, "step": 4660}, {"loss": 1.2896, "grad_norm": 1.0924615859985352, "learning_rate": 0.0002, "epoch": 5.206243032329989, "step": 4670}, {"loss": 1.245, "grad_norm": 1.0379078388214111, "learning_rate": 0.0002, "epoch": 5.217391304347826, "step": 4680}, {"loss": 1.2155, "grad_norm": 0.9507831931114197, "learning_rate": 0.0002, "epoch": 5.228539576365663, "step": 4690}, {"loss": 1.2318, "grad_norm": 1.0408620834350586, "learning_rate": 0.0002, "epoch": 5.2396878483835, "step": 4700}, {"loss": 1.1819, "grad_norm": 0.9463635087013245, "learning_rate": 0.0002, "epoch": 5.250836120401337, "step": 4710}, {"loss": 1.1951, "grad_norm": 0.8919326663017273, "learning_rate": 0.0002, "epoch": 5.261984392419175, "step": 4720}, {"loss": 1.228, "grad_norm": 1.0364950895309448, "learning_rate": 0.0002, "epoch": 5.2731326644370125, "step": 4730}, {"loss": 1.2543, "grad_norm": 1.0225472450256348, "learning_rate": 0.0002, "epoch": 5.2842809364548495, "step": 4740}, {"loss": 1.1995, "grad_norm": 0.816410481929779, "learning_rate": 0.0002, "epoch": 5.295429208472687, "step": 4750}, {"loss": 1.3601, "grad_norm": 1.0793992280960083, "learning_rate": 0.0002, "epoch": 5.306577480490524, "step": 4760}, {"loss": 1.2424, "grad_norm": 1.0203443765640259, "learning_rate": 0.0002, "epoch": 5.317725752508361, "step": 4770}, {"loss": 1.239, "grad_norm": 1.0731306076049805, "learning_rate": 0.0002, "epoch": 5.328874024526199, "step": 4780}, {"loss": 1.2893, "grad_norm": 0.9282820224761963, "learning_rate": 0.0002, "epoch": 5.340022296544036, "step": 4790}, {"loss": 1.2159, "grad_norm": 0.9741092920303345, "learning_rate": 0.0002, "epoch": 5.351170568561873, "step": 4800}, {"loss": 1.24, "grad_norm": 1.0683609247207642, "learning_rate": 0.0002, "epoch": 5.36231884057971, "step": 4810}, {"loss": 1.2316, "grad_norm": 0.9035003781318665, "learning_rate": 0.0002, "epoch": 5.373467112597547, "step": 4820}, {"loss": 1.2615, "grad_norm": 1.0590119361877441, "learning_rate": 0.0002, "epoch": 5.384615384615385, "step": 4830}, {"loss": 1.2089, "grad_norm": 0.9782686829566956, "learning_rate": 0.0002, "epoch": 5.395763656633222, "step": 4840}, {"loss": 1.3019, "grad_norm": 1.036087155342102, "learning_rate": 0.0002, "epoch": 5.406911928651059, "step": 4850}, {"loss": 1.2475, "grad_norm": 0.9999949932098389, "learning_rate": 0.0002, "epoch": 5.418060200668896, "step": 4860}, {"loss": 1.3014, "grad_norm": 0.9094445109367371, "learning_rate": 0.0002, "epoch": 5.429208472686733, "step": 4870}, {"loss": 1.2013, "grad_norm": 0.9079708456993103, "learning_rate": 0.0002, "epoch": 5.44035674470457, "step": 4880}, {"loss": 1.2224, "grad_norm": 1.0426156520843506, "learning_rate": 0.0002, "epoch": 5.451505016722408, "step": 4890}, {"loss": 1.2812, "grad_norm": 1.0110737085342407, "learning_rate": 0.0002, "epoch": 5.4626532887402455, "step": 4900}, {"loss": 1.2178, "grad_norm": 1.0994000434875488, "learning_rate": 0.0002, "epoch": 5.4738015607580826, "step": 4910}, {"loss": 1.2019, "grad_norm": 0.8988325595855713, "learning_rate": 0.0002, "epoch": 5.48494983277592, "step": 4920}, {"loss": 1.2694, "grad_norm": 1.0705887079238892, "learning_rate": 0.0002, "epoch": 5.496098104793757, "step": 4930}, {"loss": 1.1659, "grad_norm": 1.0268803834915161, "learning_rate": 0.0002, "epoch": 5.507246376811594, "step": 4940}, {"loss": 1.2845, "grad_norm": 1.0129153728485107, "learning_rate": 0.0002, "epoch": 5.518394648829432, "step": 4950}, {"loss": 1.2081, "grad_norm": 1.122117280960083, "learning_rate": 0.0002, "epoch": 5.529542920847269, "step": 4960}, {"loss": 1.2828, "grad_norm": 1.0318635702133179, "learning_rate": 0.0002, "epoch": 5.540691192865106, "step": 4970}, {"loss": 1.2424, "grad_norm": 0.9340117573738098, "learning_rate": 0.0002, "epoch": 5.551839464882943, "step": 4980}, {"loss": 1.1541, "grad_norm": 0.9427006244659424, "learning_rate": 0.0002, "epoch": 5.56298773690078, "step": 4990}, {"loss": 1.2911, "grad_norm": 1.1786518096923828, "learning_rate": 0.0002, "epoch": 5.574136008918618, "step": 5000}, {"loss": 1.2279, "grad_norm": 1.045157551765442, "learning_rate": 0.0002, "epoch": 5.585284280936455, "step": 5010}, {"loss": 1.2269, "grad_norm": 1.0475151538848877, "learning_rate": 0.0002, "epoch": 5.596432552954292, "step": 5020}, {"loss": 1.2718, "grad_norm": 1.040969729423523, "learning_rate": 0.0002, "epoch": 5.607580824972129, "step": 5030}, {"loss": 1.2134, "grad_norm": 0.9610048532485962, "learning_rate": 0.0002, "epoch": 5.618729096989966, "step": 5040}, {"loss": 1.1657, "grad_norm": 0.9774818420410156, "learning_rate": 0.0002, "epoch": 5.6298773690078034, "step": 5050}, {"loss": 1.2788, "grad_norm": 0.8715312480926514, "learning_rate": 0.0002, "epoch": 5.641025641025641, "step": 5060}, {"loss": 1.3077, "grad_norm": 0.9484505653381348, "learning_rate": 0.0002, "epoch": 5.6521739130434785, "step": 5070}, {"loss": 1.2787, "grad_norm": 0.8292845487594604, "learning_rate": 0.0002, "epoch": 5.663322185061316, "step": 5080}, {"loss": 1.2357, "grad_norm": 0.9876886606216431, "learning_rate": 0.0002, "epoch": 5.674470457079153, "step": 5090}, {"loss": 1.2864, "grad_norm": 0.9899171590805054, "learning_rate": 0.0002, "epoch": 5.68561872909699, "step": 5100}, {"loss": 1.2747, "grad_norm": 0.9693286418914795, "learning_rate": 0.0002, "epoch": 5.696767001114827, "step": 5110}, {"loss": 1.1952, "grad_norm": 0.958905816078186, "learning_rate": 0.0002, "epoch": 5.707915273132665, "step": 5120}, {"loss": 1.2889, "grad_norm": 0.9924837350845337, "learning_rate": 0.0002, "epoch": 5.719063545150502, "step": 5130}, {"loss": 1.3057, "grad_norm": 0.9551714062690735, "learning_rate": 0.0002, "epoch": 5.730211817168339, "step": 5140}, {"loss": 1.2643, "grad_norm": 1.0407027006149292, "learning_rate": 0.0002, "epoch": 5.741360089186176, "step": 5150}, {"loss": 1.1833, "grad_norm": 0.9688791036605835, "learning_rate": 0.0002, "epoch": 5.752508361204013, "step": 5160}, {"loss": 1.1424, "grad_norm": 1.0091899633407593, "learning_rate": 0.0002, "epoch": 5.763656633221851, "step": 5170}, {"loss": 1.2575, "grad_norm": 0.9393984079360962, "learning_rate": 0.0002, "epoch": 5.774804905239688, "step": 5180}, {"loss": 1.2177, "grad_norm": 1.1439075469970703, "learning_rate": 0.0002, "epoch": 5.785953177257525, "step": 5190}, {"loss": 1.3355, "grad_norm": 1.0178622007369995, "learning_rate": 0.0002, "epoch": 5.797101449275362, "step": 5200}, {"loss": 1.3317, "grad_norm": 0.8440285921096802, "learning_rate": 0.0002, "epoch": 5.808249721293199, "step": 5210}, {"loss": 1.3097, "grad_norm": 0.856838583946228, "learning_rate": 0.0002, "epoch": 5.8193979933110365, "step": 5220}, {"loss": 1.3109, "grad_norm": 0.8676707148551941, "learning_rate": 0.0002, "epoch": 5.8305462653288735, "step": 5230}, {"loss": 1.248, "grad_norm": 1.1034743785858154, "learning_rate": 0.0002, "epoch": 5.8416945373467115, "step": 5240}, {"loss": 1.2473, "grad_norm": 0.9631003737449646, "learning_rate": 0.0002, "epoch": 5.852842809364549, "step": 5250}, {"loss": 1.2693, "grad_norm": 1.0478793382644653, "learning_rate": 0.0002, "epoch": 5.863991081382386, "step": 5260}, {"loss": 1.2349, "grad_norm": 0.9819806218147278, "learning_rate": 0.0002, "epoch": 5.875139353400223, "step": 5270}, {"loss": 1.2817, "grad_norm": 0.8572421073913574, "learning_rate": 0.0002, "epoch": 5.88628762541806, "step": 5280}, {"loss": 1.246, "grad_norm": 0.9328814148902893, "learning_rate": 0.0002, "epoch": 5.897435897435898, "step": 5290}, {"loss": 1.3016, "grad_norm": 1.000305414199829, "learning_rate": 0.0002, "epoch": 5.908584169453735, "step": 5300}, {"loss": 1.3681, "grad_norm": 1.1006377935409546, "learning_rate": 0.0002, "epoch": 5.919732441471572, "step": 5310}, {"loss": 1.3317, "grad_norm": 0.963198721408844, "learning_rate": 0.0002, "epoch": 5.930880713489409, "step": 5320}, {"loss": 1.2713, "grad_norm": 0.8952236175537109, "learning_rate": 0.0002, "epoch": 5.942028985507246, "step": 5330}, {"loss": 1.2536, "grad_norm": 1.0945496559143066, "learning_rate": 0.0002, "epoch": 5.953177257525084, "step": 5340}, {"loss": 1.2768, "grad_norm": 1.0053467750549316, "learning_rate": 0.0002, "epoch": 5.964325529542921, "step": 5350}, {"loss": 1.3075, "grad_norm": 1.032088279724121, "learning_rate": 0.0002, "epoch": 5.975473801560758, "step": 5360}, {"loss": 1.3278, "grad_norm": 1.1068958044052124, "learning_rate": 0.0002, "epoch": 5.986622073578595, "step": 5370}, {"loss": 1.2468, "grad_norm": 1.0064235925674438, "learning_rate": 0.0002, "epoch": 5.997770345596432, "step": 5380}, {"eval_loss": 2.0690135955810547, "eval_runtime": 38.1748, "eval_samples_per_second": 13.491, "eval_steps_per_second": 1.703, "epoch": 6.0, "step": 5382}, {"loss": 1.1062, "grad_norm": 0.9700132608413696, "learning_rate": 0.0002, "epoch": 6.0089186176142695, "step": 5390}, {"loss": 1.097, "grad_norm": 1.159369707107544, "learning_rate": 0.0002, "epoch": 6.0200668896321075, "step": 5400}, {"loss": 1.0646, "grad_norm": 1.332871913909912, "learning_rate": 0.0002, "epoch": 6.0312151616499445, "step": 5410}, {"loss": 1.0882, "grad_norm": 1.2239890098571777, "learning_rate": 0.0002, "epoch": 6.042363433667782, "step": 5420}, {"loss": 1.0505, "grad_norm": 1.5238478183746338, "learning_rate": 0.0002, "epoch": 6.053511705685619, "step": 5430}, {"loss": 1.1423, "grad_norm": 1.24699068069458, "learning_rate": 0.0002, "epoch": 6.064659977703456, "step": 5440}, {"loss": 1.0789, "grad_norm": 1.0891860723495483, "learning_rate": 0.0002, "epoch": 6.075808249721293, "step": 5450}, {"loss": 1.1439, "grad_norm": 1.2695465087890625, "learning_rate": 0.0002, "epoch": 6.086956521739131, "step": 5460}, {"loss": 1.0728, "grad_norm": 1.0630067586898804, "learning_rate": 0.0002, "epoch": 6.098104793756968, "step": 5470}, {"loss": 1.0391, "grad_norm": 0.9666808247566223, "learning_rate": 0.0002, "epoch": 6.109253065774805, "step": 5480}, {"loss": 1.1159, "grad_norm": 0.8925976157188416, "learning_rate": 0.0002, "epoch": 6.120401337792642, "step": 5490}, {"loss": 1.0371, "grad_norm": 1.0824475288391113, "learning_rate": 0.0002, "epoch": 6.131549609810479, "step": 5500}, {"loss": 1.1568, "grad_norm": 1.2315316200256348, "learning_rate": 0.0002, "epoch": 6.142697881828316, "step": 5510}, {"loss": 1.0896, "grad_norm": 1.2484779357910156, "learning_rate": 0.0002, "epoch": 6.153846153846154, "step": 5520}, {"loss": 1.0368, "grad_norm": 1.2468485832214355, "learning_rate": 0.0002, "epoch": 6.164994425863991, "step": 5530}, {"loss": 1.1368, "grad_norm": 1.0837156772613525, "learning_rate": 0.0002, "epoch": 6.176142697881828, "step": 5540}, {"loss": 1.1042, "grad_norm": 1.1650336980819702, "learning_rate": 0.0002, "epoch": 6.187290969899665, "step": 5550}, {"loss": 1.0495, "grad_norm": 1.2004241943359375, "learning_rate": 0.0002, "epoch": 6.1984392419175025, "step": 5560}, {"loss": 1.023, "grad_norm": 1.0223793983459473, "learning_rate": 0.0002, "epoch": 6.20958751393534, "step": 5570}, {"loss": 1.0837, "grad_norm": 1.4045847654342651, "learning_rate": 0.0002, "epoch": 6.2207357859531776, "step": 5580}, {"loss": 1.1168, "grad_norm": 1.3042256832122803, "learning_rate": 0.0002, "epoch": 6.231884057971015, "step": 5590}, {"loss": 1.0138, "grad_norm": 1.1762887239456177, "learning_rate": 0.0002, "epoch": 6.243032329988852, "step": 5600}, {"loss": 1.1651, "grad_norm": 1.1739851236343384, "learning_rate": 0.0002, "epoch": 6.254180602006689, "step": 5610}, {"loss": 1.1004, "grad_norm": 1.2904260158538818, "learning_rate": 0.0002, "epoch": 6.265328874024526, "step": 5620}, {"loss": 1.0803, "grad_norm": 1.3218393325805664, "learning_rate": 0.0002, "epoch": 6.276477146042364, "step": 5630}, {"loss": 1.0876, "grad_norm": 1.241175889968872, "learning_rate": 0.0002, "epoch": 6.287625418060201, "step": 5640}, {"loss": 1.128, "grad_norm": 1.2916349172592163, "learning_rate": 0.0002, "epoch": 6.298773690078038, "step": 5650}, {"loss": 1.1197, "grad_norm": 1.5129448175430298, "learning_rate": 0.0002, "epoch": 6.309921962095875, "step": 5660}, {"loss": 1.0723, "grad_norm": 1.0297393798828125, "learning_rate": 0.0002, "epoch": 6.321070234113712, "step": 5670}, {"loss": 1.0513, "grad_norm": 1.1127521991729736, "learning_rate": 0.0002, "epoch": 6.332218506131549, "step": 5680}, {"loss": 1.0305, "grad_norm": 1.0972518920898438, "learning_rate": 0.0002, "epoch": 6.343366778149387, "step": 5690}, {"loss": 1.0616, "grad_norm": 1.4237337112426758, "learning_rate": 0.0002, "epoch": 6.354515050167224, "step": 5700}, {"loss": 1.0924, "grad_norm": 1.121502161026001, "learning_rate": 0.0002, "epoch": 6.365663322185061, "step": 5710}, {"loss": 1.0208, "grad_norm": 1.1007202863693237, "learning_rate": 0.0002, "epoch": 6.3768115942028984, "step": 5720}, {"loss": 1.1178, "grad_norm": 1.1609363555908203, "learning_rate": 0.0002, "epoch": 6.3879598662207355, "step": 5730}, {"loss": 1.1068, "grad_norm": 1.3008915185928345, "learning_rate": 0.0002, "epoch": 6.399108138238573, "step": 5740}, {"loss": 1.1647, "grad_norm": 1.184460163116455, "learning_rate": 0.0002, "epoch": 6.410256410256411, "step": 5750}, {"loss": 1.109, "grad_norm": 1.2092398405075073, "learning_rate": 0.0002, "epoch": 6.421404682274248, "step": 5760}, {"loss": 1.093, "grad_norm": 1.2273279428482056, "learning_rate": 0.0002, "epoch": 6.432552954292085, "step": 5770}, {"loss": 1.1171, "grad_norm": 1.0721677541732788, "learning_rate": 0.0002, "epoch": 6.443701226309922, "step": 5780}, {"loss": 1.0585, "grad_norm": 1.1679279804229736, "learning_rate": 0.0002, "epoch": 6.454849498327759, "step": 5790}, {"loss": 1.0795, "grad_norm": 1.3658736944198608, "learning_rate": 0.0002, "epoch": 6.465997770345597, "step": 5800}, {"loss": 1.0951, "grad_norm": 1.2440944910049438, "learning_rate": 0.0002, "epoch": 6.477146042363434, "step": 5810}, {"loss": 1.0815, "grad_norm": 1.1838182210922241, "learning_rate": 0.0002, "epoch": 6.488294314381271, "step": 5820}, {"loss": 1.0543, "grad_norm": 1.1993956565856934, "learning_rate": 0.0002, "epoch": 6.499442586399108, "step": 5830}, {"loss": 1.1587, "grad_norm": 1.1028285026550293, "learning_rate": 0.0002, "epoch": 6.510590858416945, "step": 5840}, {"loss": 1.1245, "grad_norm": 1.2117441892623901, "learning_rate": 0.0002, "epoch": 6.521739130434782, "step": 5850}, {"loss": 1.1237, "grad_norm": 1.2012946605682373, "learning_rate": 0.0002, "epoch": 6.53288740245262, "step": 5860}, {"loss": 1.1038, "grad_norm": 1.2491029500961304, "learning_rate": 0.0002, "epoch": 6.544035674470457, "step": 5870}, {"loss": 1.1183, "grad_norm": 1.4130326509475708, "learning_rate": 0.0002, "epoch": 6.555183946488294, "step": 5880}, {"loss": 1.1094, "grad_norm": 1.2596930265426636, "learning_rate": 0.0002, "epoch": 6.5663322185061315, "step": 5890}, {"loss": 1.1445, "grad_norm": 1.32266104221344, "learning_rate": 0.0002, "epoch": 6.5774804905239685, "step": 5900}, {"loss": 1.169, "grad_norm": 1.3093374967575073, "learning_rate": 0.0002, "epoch": 6.588628762541806, "step": 5910}, {"loss": 1.161, "grad_norm": 1.0436453819274902, "learning_rate": 0.0002, "epoch": 6.599777034559644, "step": 5920}, {"loss": 1.1358, "grad_norm": 1.064468502998352, "learning_rate": 0.0002, "epoch": 6.610925306577481, "step": 5930}, {"loss": 1.1443, "grad_norm": 1.2561777830123901, "learning_rate": 0.0002, "epoch": 6.622073578595318, "step": 5940}, {"loss": 1.1088, "grad_norm": 1.2759621143341064, "learning_rate": 0.0002, "epoch": 6.633221850613155, "step": 5950}, {"loss": 1.1103, "grad_norm": 1.0602868795394897, "learning_rate": 0.0002, "epoch": 6.644370122630992, "step": 5960}, {"loss": 1.2081, "grad_norm": 1.2336751222610474, "learning_rate": 0.0002, "epoch": 6.65551839464883, "step": 5970}, {"loss": 1.1264, "grad_norm": 1.1773011684417725, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 5980}, {"loss": 1.1641, "grad_norm": 1.0779681205749512, "learning_rate": 0.0002, "epoch": 6.677814938684504, "step": 5990}, {"loss": 1.1034, "grad_norm": 1.396223783493042, "learning_rate": 0.0002, "epoch": 6.688963210702341, "step": 6000}, {"loss": 1.1418, "grad_norm": 1.2238768339157104, "learning_rate": 0.0002, "epoch": 6.700111482720178, "step": 6010}, {"loss": 1.098, "grad_norm": 1.1152666807174683, "learning_rate": 0.0002, "epoch": 6.711259754738015, "step": 6020}, {"loss": 1.1602, "grad_norm": 1.2376031875610352, "learning_rate": 0.0002, "epoch": 6.722408026755852, "step": 6030}, {"loss": 1.1062, "grad_norm": 1.0868488550186157, "learning_rate": 0.0002, "epoch": 6.73355629877369, "step": 6040}, {"loss": 1.1366, "grad_norm": 1.265913724899292, "learning_rate": 0.0002, "epoch": 6.744704570791527, "step": 6050}, {"loss": 1.0959, "grad_norm": 1.1551072597503662, "learning_rate": 0.0002, "epoch": 6.7558528428093645, "step": 6060}, {"loss": 1.1395, "grad_norm": 1.0813109874725342, "learning_rate": 0.0002, "epoch": 6.767001114827202, "step": 6070}, {"loss": 1.1047, "grad_norm": 1.2367933988571167, "learning_rate": 0.0002, "epoch": 6.778149386845039, "step": 6080}, {"loss": 1.0803, "grad_norm": 1.1612437963485718, "learning_rate": 0.0002, "epoch": 6.789297658862877, "step": 6090}, {"loss": 1.1462, "grad_norm": 1.2715837955474854, "learning_rate": 0.0002, "epoch": 6.800445930880714, "step": 6100}, {"loss": 1.1371, "grad_norm": 1.1385036706924438, "learning_rate": 0.0002, "epoch": 6.811594202898551, "step": 6110}, {"loss": 1.137, "grad_norm": 1.4322341680526733, "learning_rate": 0.0002, "epoch": 6.822742474916388, "step": 6120}, {"loss": 1.1571, "grad_norm": 1.2975877523422241, "learning_rate": 0.0002, "epoch": 6.833890746934225, "step": 6130}, {"loss": 1.1592, "grad_norm": 1.0241044759750366, "learning_rate": 0.0002, "epoch": 6.845039018952063, "step": 6140}, {"loss": 1.1369, "grad_norm": 1.352594017982483, "learning_rate": 0.0002, "epoch": 6.8561872909699, "step": 6150}, {"loss": 1.112, "grad_norm": 1.1166167259216309, "learning_rate": 0.0002, "epoch": 6.867335562987737, "step": 6160}, {"loss": 1.1409, "grad_norm": 1.1596941947937012, "learning_rate": 0.0002, "epoch": 6.878483835005574, "step": 6170}, {"loss": 1.1258, "grad_norm": 1.5753912925720215, "learning_rate": 0.0002, "epoch": 6.889632107023411, "step": 6180}, {"loss": 1.1154, "grad_norm": 1.1857494115829468, "learning_rate": 0.0002, "epoch": 6.900780379041248, "step": 6190}, {"loss": 1.137, "grad_norm": 1.1507896184921265, "learning_rate": 0.0002, "epoch": 6.911928651059085, "step": 6200}, {"loss": 1.1532, "grad_norm": 1.5194647312164307, "learning_rate": 0.0002, "epoch": 6.923076923076923, "step": 6210}, {"loss": 1.1315, "grad_norm": 1.1627732515335083, "learning_rate": 0.0002, "epoch": 6.93422519509476, "step": 6220}, {"loss": 1.1079, "grad_norm": 1.1929609775543213, "learning_rate": 0.0002, "epoch": 6.9453734671125975, "step": 6230}, {"loss": 1.1331, "grad_norm": 1.2704664468765259, "learning_rate": 0.0002, "epoch": 6.956521739130435, "step": 6240}, {"loss": 1.1177, "grad_norm": 1.1791198253631592, "learning_rate": 0.0002, "epoch": 6.967670011148272, "step": 6250}, {"loss": 1.1152, "grad_norm": 1.1948790550231934, "learning_rate": 0.0002, "epoch": 6.97881828316611, "step": 6260}, {"loss": 1.1213, "grad_norm": 1.222116231918335, "learning_rate": 0.0002, "epoch": 6.989966555183947, "step": 6270}]} +{"epoch": 8.0, "step": 7176, "epoch_duration": 976.1707360744476, "total_accumulated_duration": 7892.630501270294, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-1/checkpoint-1794", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5946, "grad_norm": 0.4864582419395447, "learning_rate": 0.0002, "epoch": 0.011148272017837236, "step": 10}, {"loss": 2.2959, "grad_norm": 0.6151555776596069, "learning_rate": 0.0002, "epoch": 0.022296544035674472, "step": 20}, {"loss": 2.008, "grad_norm": 0.541170060634613, "learning_rate": 0.0002, "epoch": 0.033444816053511704, "step": 30}, {"loss": 1.9404, "grad_norm": 0.4160577058792114, "learning_rate": 0.0002, "epoch": 0.044593088071348944, "step": 40}, {"loss": 1.9695, "grad_norm": 0.5151045918464661, "learning_rate": 0.0002, "epoch": 0.055741360089186176, "step": 50}, {"loss": 1.9375, "grad_norm": 0.4899227023124695, "learning_rate": 0.0002, "epoch": 0.06688963210702341, "step": 60}, {"loss": 1.8537, "grad_norm": 0.6387737393379211, "learning_rate": 0.0002, "epoch": 0.07803790412486064, "step": 70}, {"loss": 1.8591, "grad_norm": 0.44113653898239136, "learning_rate": 0.0002, "epoch": 0.08918617614269789, "step": 80}, {"loss": 1.9253, "grad_norm": 0.4688360393047333, "learning_rate": 0.0002, "epoch": 0.10033444816053512, "step": 90}, {"loss": 1.9809, "grad_norm": 0.44789502024650574, "learning_rate": 0.0002, "epoch": 0.11148272017837235, "step": 100}, {"loss": 1.8297, "grad_norm": 0.4484880864620209, "learning_rate": 0.0002, "epoch": 0.12263099219620958, "step": 110}, {"loss": 1.8392, "grad_norm": 0.46527230739593506, "learning_rate": 0.0002, "epoch": 0.13377926421404682, "step": 120}, {"loss": 1.8941, "grad_norm": 0.5095470547676086, "learning_rate": 0.0002, "epoch": 0.14492753623188406, "step": 130}, {"loss": 1.8936, "grad_norm": 0.4180101752281189, "learning_rate": 0.0002, "epoch": 0.15607580824972128, "step": 140}, {"loss": 1.8467, "grad_norm": 0.45976975560188293, "learning_rate": 0.0002, "epoch": 0.16722408026755853, "step": 150}, {"loss": 1.8996, "grad_norm": 0.43929311633110046, "learning_rate": 0.0002, "epoch": 0.17837235228539577, "step": 160}, {"loss": 1.828, "grad_norm": 0.43384963274002075, "learning_rate": 0.0002, "epoch": 0.189520624303233, "step": 170}, {"loss": 1.8599, "grad_norm": 0.4810775816440582, "learning_rate": 0.0002, "epoch": 0.20066889632107024, "step": 180}, {"loss": 1.8105, "grad_norm": 0.4231500029563904, "learning_rate": 0.0002, "epoch": 0.21181716833890746, "step": 190}, {"loss": 1.8029, "grad_norm": 0.40217751264572144, "learning_rate": 0.0002, "epoch": 0.2229654403567447, "step": 200}, {"loss": 1.8125, "grad_norm": 0.3772163689136505, "learning_rate": 0.0002, "epoch": 0.23411371237458195, "step": 210}, {"loss": 1.8709, "grad_norm": 0.3765389621257782, "learning_rate": 0.0002, "epoch": 0.24526198439241917, "step": 220}, {"loss": 1.8571, "grad_norm": 0.3947426378726959, "learning_rate": 0.0002, "epoch": 0.2564102564102564, "step": 230}, {"loss": 1.7517, "grad_norm": 0.38083791732788086, "learning_rate": 0.0002, "epoch": 0.26755852842809363, "step": 240}, {"loss": 1.7449, "grad_norm": 0.6683781743049622, "learning_rate": 0.0002, "epoch": 0.2787068004459309, "step": 250}, {"loss": 1.787, "grad_norm": 0.41476085782051086, "learning_rate": 0.0002, "epoch": 0.2898550724637681, "step": 260}, {"loss": 1.8212, "grad_norm": 0.3722982704639435, "learning_rate": 0.0002, "epoch": 0.3010033444816054, "step": 270}, {"loss": 1.8929, "grad_norm": 0.4132225811481476, "learning_rate": 0.0002, "epoch": 0.31215161649944256, "step": 280}, {"loss": 1.9126, "grad_norm": 0.41937923431396484, "learning_rate": 0.0002, "epoch": 0.3232998885172798, "step": 290}, {"loss": 1.9065, "grad_norm": 0.3839682340621948, "learning_rate": 0.0002, "epoch": 0.33444816053511706, "step": 300}, {"loss": 1.8818, "grad_norm": 0.33736854791641235, "learning_rate": 0.0002, "epoch": 0.3455964325529543, "step": 310}, {"loss": 1.8061, "grad_norm": 0.4552125334739685, "learning_rate": 0.0002, "epoch": 0.35674470457079155, "step": 320}, {"loss": 1.8141, "grad_norm": 0.3592551350593567, "learning_rate": 0.0002, "epoch": 0.36789297658862874, "step": 330}, {"loss": 1.8174, "grad_norm": 0.3872784972190857, "learning_rate": 0.0002, "epoch": 0.379041248606466, "step": 340}, {"loss": 1.7789, "grad_norm": 0.35498011112213135, "learning_rate": 0.0002, "epoch": 0.39018952062430323, "step": 350}, {"loss": 1.8456, "grad_norm": 0.3489432632923126, "learning_rate": 0.0002, "epoch": 0.4013377926421405, "step": 360}, {"loss": 1.8374, "grad_norm": 0.3511202037334442, "learning_rate": 0.0002, "epoch": 0.4124860646599777, "step": 370}, {"loss": 1.7845, "grad_norm": 0.3891856074333191, "learning_rate": 0.0002, "epoch": 0.4236343366778149, "step": 380}, {"loss": 1.7828, "grad_norm": 0.4112119972705841, "learning_rate": 0.0002, "epoch": 0.43478260869565216, "step": 390}, {"loss": 1.7746, "grad_norm": 0.3329351246356964, "learning_rate": 0.0002, "epoch": 0.4459308807134894, "step": 400}, {"loss": 1.7894, "grad_norm": 0.32010194659233093, "learning_rate": 0.0002, "epoch": 0.45707915273132665, "step": 410}, {"loss": 1.8266, "grad_norm": 0.3335704505443573, "learning_rate": 0.0002, "epoch": 0.4682274247491639, "step": 420}, {"loss": 1.836, "grad_norm": 0.3508165180683136, "learning_rate": 0.0002, "epoch": 0.4793756967670011, "step": 430}, {"loss": 1.8241, "grad_norm": 0.3818604052066803, "learning_rate": 0.0002, "epoch": 0.49052396878483834, "step": 440}, {"loss": 1.7451, "grad_norm": 0.37044021487236023, "learning_rate": 0.0002, "epoch": 0.5016722408026756, "step": 450}, {"loss": 1.7862, "grad_norm": 0.3258146047592163, "learning_rate": 0.0002, "epoch": 0.5128205128205128, "step": 460}, {"loss": 1.8662, "grad_norm": 0.3390968143939972, "learning_rate": 0.0002, "epoch": 0.5239687848383501, "step": 470}, {"loss": 1.8545, "grad_norm": 0.41194117069244385, "learning_rate": 0.0002, "epoch": 0.5351170568561873, "step": 480}, {"loss": 1.8727, "grad_norm": 0.34630897641181946, "learning_rate": 0.0002, "epoch": 0.5462653288740246, "step": 490}, {"loss": 1.7747, "grad_norm": 0.28459733724594116, "learning_rate": 0.0002, "epoch": 0.5574136008918618, "step": 500}, {"loss": 1.8307, "grad_norm": 0.33051759004592896, "learning_rate": 0.0002, "epoch": 0.568561872909699, "step": 510}, {"loss": 1.8997, "grad_norm": 0.37259650230407715, "learning_rate": 0.0002, "epoch": 0.5797101449275363, "step": 520}, {"loss": 1.8081, "grad_norm": 0.4604213833808899, "learning_rate": 0.0002, "epoch": 0.5908584169453734, "step": 530}, {"loss": 1.7226, "grad_norm": 0.3107241988182068, "learning_rate": 0.0002, "epoch": 0.6020066889632107, "step": 540}, {"loss": 1.8096, "grad_norm": 0.34454235434532166, "learning_rate": 0.0002, "epoch": 0.6131549609810479, "step": 550}, {"loss": 1.8061, "grad_norm": 0.32745128870010376, "learning_rate": 0.0002, "epoch": 0.6243032329988851, "step": 560}, {"loss": 1.8565, "grad_norm": 0.32668930292129517, "learning_rate": 0.0002, "epoch": 0.6354515050167224, "step": 570}, {"loss": 1.7705, "grad_norm": 0.31747013330459595, "learning_rate": 0.0002, "epoch": 0.6465997770345596, "step": 580}, {"loss": 1.7835, "grad_norm": 0.3399045169353485, "learning_rate": 0.0002, "epoch": 0.6577480490523969, "step": 590}, {"loss": 1.8004, "grad_norm": 0.40407994389533997, "learning_rate": 0.0002, "epoch": 0.6688963210702341, "step": 600}, {"loss": 1.8037, "grad_norm": 0.3739639222621918, "learning_rate": 0.0002, "epoch": 0.6800445930880713, "step": 610}, {"loss": 1.8654, "grad_norm": 0.3739263713359833, "learning_rate": 0.0002, "epoch": 0.6911928651059086, "step": 620}, {"loss": 1.8664, "grad_norm": 0.3418176770210266, "learning_rate": 0.0002, "epoch": 0.7023411371237458, "step": 630}, {"loss": 1.8081, "grad_norm": 0.3314031660556793, "learning_rate": 0.0002, "epoch": 0.7134894091415831, "step": 640}, {"loss": 1.7452, "grad_norm": 0.3569042384624481, "learning_rate": 0.0002, "epoch": 0.7246376811594203, "step": 650}, {"loss": 1.8655, "grad_norm": 0.4068199098110199, "learning_rate": 0.0002, "epoch": 0.7357859531772575, "step": 660}, {"loss": 1.748, "grad_norm": 0.385543555021286, "learning_rate": 0.0002, "epoch": 0.7469342251950948, "step": 670}, {"loss": 1.8055, "grad_norm": 0.3103431165218353, "learning_rate": 0.0002, "epoch": 0.758082497212932, "step": 680}, {"loss": 1.7255, "grad_norm": 0.32295092940330505, "learning_rate": 0.0002, "epoch": 0.7692307692307693, "step": 690}, {"loss": 1.7743, "grad_norm": 0.38221824169158936, "learning_rate": 0.0002, "epoch": 0.7803790412486065, "step": 700}, {"loss": 1.7581, "grad_norm": 0.3228561282157898, "learning_rate": 0.0002, "epoch": 0.7915273132664437, "step": 710}, {"loss": 1.8552, "grad_norm": 0.32148292660713196, "learning_rate": 0.0002, "epoch": 0.802675585284281, "step": 720}, {"loss": 1.823, "grad_norm": 0.3125041723251343, "learning_rate": 0.0002, "epoch": 0.8138238573021181, "step": 730}, {"loss": 1.733, "grad_norm": 0.43717217445373535, "learning_rate": 0.0002, "epoch": 0.8249721293199554, "step": 740}, {"loss": 1.7133, "grad_norm": 0.32372939586639404, "learning_rate": 0.0002, "epoch": 0.8361204013377926, "step": 750}, {"loss": 1.7855, "grad_norm": 0.3270736336708069, "learning_rate": 0.0002, "epoch": 0.8472686733556298, "step": 760}, {"loss": 1.8283, "grad_norm": 0.32658815383911133, "learning_rate": 0.0002, "epoch": 0.8584169453734671, "step": 770}, {"loss": 1.7751, "grad_norm": 0.3742631673812866, "learning_rate": 0.0002, "epoch": 0.8695652173913043, "step": 780}, {"loss": 1.7664, "grad_norm": 0.3322608172893524, "learning_rate": 0.0002, "epoch": 0.8807134894091416, "step": 790}, {"loss": 1.7984, "grad_norm": 0.441494882106781, "learning_rate": 0.0002, "epoch": 0.8918617614269788, "step": 800}, {"loss": 1.8352, "grad_norm": 0.38793420791625977, "learning_rate": 0.0002, "epoch": 0.903010033444816, "step": 810}, {"loss": 1.8183, "grad_norm": 0.4095474183559418, "learning_rate": 0.0002, "epoch": 0.9141583054626533, "step": 820}, {"loss": 1.7837, "grad_norm": 0.36847662925720215, "learning_rate": 0.0002, "epoch": 0.9253065774804905, "step": 830}, {"loss": 1.7867, "grad_norm": 0.28806909918785095, "learning_rate": 0.0002, "epoch": 0.9364548494983278, "step": 840}, {"loss": 1.848, "grad_norm": 0.3261156976222992, "learning_rate": 0.0002, "epoch": 0.947603121516165, "step": 850}, {"loss": 1.693, "grad_norm": 0.4674798250198364, "learning_rate": 0.0002, "epoch": 0.9587513935340022, "step": 860}, {"loss": 1.7742, "grad_norm": 0.30819064378738403, "learning_rate": 0.0002, "epoch": 0.9698996655518395, "step": 870}, {"loss": 1.8184, "grad_norm": 0.32203033566474915, "learning_rate": 0.0002, "epoch": 0.9810479375696767, "step": 880}, {"loss": 1.7701, "grad_norm": 0.3409714102745056, "learning_rate": 0.0002, "epoch": 0.992196209587514, "step": 890}, {"eval_loss": 1.8143481016159058, "eval_runtime": 37.921, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 1.0, "step": 897}, {"loss": 1.8029, "grad_norm": 0.29757317900657654, "learning_rate": 0.0002, "epoch": 1.0033444816053512, "step": 900}, {"loss": 1.7376, "grad_norm": 0.32168492674827576, "learning_rate": 0.0002, "epoch": 1.0144927536231885, "step": 910}, {"loss": 1.6785, "grad_norm": 0.3430717885494232, "learning_rate": 0.0002, "epoch": 1.0256410256410255, "step": 920}, {"loss": 1.7356, "grad_norm": 0.3431745767593384, "learning_rate": 0.0002, "epoch": 1.0367892976588629, "step": 930}, {"loss": 1.7932, "grad_norm": 0.39787548780441284, "learning_rate": 0.0002, "epoch": 1.0479375696767002, "step": 940}, {"loss": 1.7434, "grad_norm": 0.3540935218334198, "learning_rate": 0.0002, "epoch": 1.0590858416945372, "step": 950}, {"loss": 1.7693, "grad_norm": 0.368484765291214, "learning_rate": 0.0002, "epoch": 1.0702341137123745, "step": 960}, {"loss": 1.6887, "grad_norm": 0.41324466466903687, "learning_rate": 0.0002, "epoch": 1.0813823857302118, "step": 970}, {"loss": 1.7288, "grad_norm": 0.3696419596672058, "learning_rate": 0.0002, "epoch": 1.0925306577480491, "step": 980}, {"loss": 1.7743, "grad_norm": 0.33832886815071106, "learning_rate": 0.0002, "epoch": 1.1036789297658862, "step": 990}, {"loss": 1.7445, "grad_norm": 0.4411991834640503, "learning_rate": 0.0002, "epoch": 1.1148272017837235, "step": 1000}, {"loss": 1.7699, "grad_norm": 0.3935333788394928, "learning_rate": 0.0002, "epoch": 1.1259754738015608, "step": 1010}, {"loss": 1.6909, "grad_norm": 0.32472893595695496, "learning_rate": 0.0002, "epoch": 1.137123745819398, "step": 1020}, {"loss": 1.6974, "grad_norm": 0.3455545902252197, "learning_rate": 0.0002, "epoch": 1.1482720178372352, "step": 1030}, {"loss": 1.7555, "grad_norm": 0.3995654582977295, "learning_rate": 0.0002, "epoch": 1.1594202898550725, "step": 1040}, {"loss": 1.7419, "grad_norm": 0.384056031703949, "learning_rate": 0.0002, "epoch": 1.1705685618729098, "step": 1050}, {"loss": 1.7693, "grad_norm": 0.4345705211162567, "learning_rate": 0.0002, "epoch": 1.1817168338907469, "step": 1060}, {"loss": 1.7219, "grad_norm": 0.3524057865142822, "learning_rate": 0.0002, "epoch": 1.1928651059085842, "step": 1070}, {"loss": 1.6701, "grad_norm": 0.4047132134437561, "learning_rate": 0.0002, "epoch": 1.2040133779264215, "step": 1080}, {"loss": 1.7035, "grad_norm": 0.365824431180954, "learning_rate": 0.0002, "epoch": 1.2151616499442586, "step": 1090}, {"loss": 1.7367, "grad_norm": 0.37048354744911194, "learning_rate": 0.0002, "epoch": 1.2263099219620959, "step": 1100}, {"loss": 1.7503, "grad_norm": 0.3753672242164612, "learning_rate": 0.0002, "epoch": 1.2374581939799332, "step": 1110}, {"loss": 1.6984, "grad_norm": 0.37887042760849, "learning_rate": 0.0002, "epoch": 1.2486064659977703, "step": 1120}, {"loss": 1.7866, "grad_norm": 0.3896579444408417, "learning_rate": 0.0002, "epoch": 1.2597547380156076, "step": 1130}, {"loss": 1.8085, "grad_norm": 0.3725394010543823, "learning_rate": 0.0002, "epoch": 1.2709030100334449, "step": 1140}, {"loss": 1.6942, "grad_norm": 0.373989999294281, "learning_rate": 0.0002, "epoch": 1.282051282051282, "step": 1150}, {"loss": 1.7566, "grad_norm": 0.4412260353565216, "learning_rate": 0.0002, "epoch": 1.2931995540691192, "step": 1160}, {"loss": 1.7425, "grad_norm": 0.38538658618927, "learning_rate": 0.0002, "epoch": 1.3043478260869565, "step": 1170}, {"loss": 1.6573, "grad_norm": 0.3644104599952698, "learning_rate": 0.0002, "epoch": 1.3154960981047936, "step": 1180}, {"loss": 1.6186, "grad_norm": 0.3615347743034363, "learning_rate": 0.0002, "epoch": 1.326644370122631, "step": 1190}, {"loss": 1.7575, "grad_norm": 0.4260489046573639, "learning_rate": 0.0002, "epoch": 1.3377926421404682, "step": 1200}, {"loss": 1.762, "grad_norm": 0.35236871242523193, "learning_rate": 0.0002, "epoch": 1.3489409141583055, "step": 1210}, {"loss": 1.7207, "grad_norm": 0.45456627011299133, "learning_rate": 0.0002, "epoch": 1.3600891861761428, "step": 1220}, {"loss": 1.7391, "grad_norm": 0.391541063785553, "learning_rate": 0.0002, "epoch": 1.37123745819398, "step": 1230}, {"loss": 1.7309, "grad_norm": 0.37955328822135925, "learning_rate": 0.0002, "epoch": 1.3823857302118172, "step": 1240}, {"loss": 1.7028, "grad_norm": 0.36955225467681885, "learning_rate": 0.0002, "epoch": 1.3935340022296545, "step": 1250}, {"loss": 1.7027, "grad_norm": 0.36156216263771057, "learning_rate": 0.0002, "epoch": 1.4046822742474916, "step": 1260}, {"loss": 1.8091, "grad_norm": 0.4083487391471863, "learning_rate": 0.0002, "epoch": 1.415830546265329, "step": 1270}, {"loss": 1.7551, "grad_norm": 0.420171320438385, "learning_rate": 0.0002, "epoch": 1.4269788182831662, "step": 1280}, {"loss": 1.7377, "grad_norm": 0.3581725060939789, "learning_rate": 0.0002, "epoch": 1.4381270903010033, "step": 1290}, {"loss": 1.728, "grad_norm": 0.3657953441143036, "learning_rate": 0.0002, "epoch": 1.4492753623188406, "step": 1300}, {"loss": 1.7116, "grad_norm": 0.3139931857585907, "learning_rate": 0.0002, "epoch": 1.4604236343366779, "step": 1310}, {"loss": 1.671, "grad_norm": 0.37750574946403503, "learning_rate": 0.0002, "epoch": 1.471571906354515, "step": 1320}, {"loss": 1.7663, "grad_norm": 0.37787437438964844, "learning_rate": 0.0002, "epoch": 1.4827201783723523, "step": 1330}, {"loss": 1.6403, "grad_norm": 0.39505279064178467, "learning_rate": 0.0002, "epoch": 1.4938684503901896, "step": 1340}, {"loss": 1.7745, "grad_norm": 0.39977672696113586, "learning_rate": 0.0002, "epoch": 1.5050167224080266, "step": 1350}, {"loss": 1.7339, "grad_norm": 0.4395383298397064, "learning_rate": 0.0002, "epoch": 1.516164994425864, "step": 1360}, {"loss": 1.7315, "grad_norm": 0.3452998995780945, "learning_rate": 0.0002, "epoch": 1.5273132664437012, "step": 1370}, {"loss": 1.7244, "grad_norm": 0.39573904871940613, "learning_rate": 0.0002, "epoch": 1.5384615384615383, "step": 1380}, {"loss": 1.7453, "grad_norm": 0.4886358976364136, "learning_rate": 0.0002, "epoch": 1.5496098104793758, "step": 1390}, {"loss": 1.7294, "grad_norm": 0.35525891184806824, "learning_rate": 0.0002, "epoch": 1.560758082497213, "step": 1400}, {"loss": 1.6896, "grad_norm": 0.3873274028301239, "learning_rate": 0.0002, "epoch": 1.57190635451505, "step": 1410}, {"loss": 1.7545, "grad_norm": 0.35162487626075745, "learning_rate": 0.0002, "epoch": 1.5830546265328875, "step": 1420}, {"loss": 1.7403, "grad_norm": 0.3533175587654114, "learning_rate": 0.0002, "epoch": 1.5942028985507246, "step": 1430}, {"loss": 1.7199, "grad_norm": 0.35397887229919434, "learning_rate": 0.0002, "epoch": 1.605351170568562, "step": 1440}, {"loss": 1.701, "grad_norm": 0.3539091646671295, "learning_rate": 0.0002, "epoch": 1.6164994425863992, "step": 1450}, {"loss": 1.7407, "grad_norm": 0.38557013869285583, "learning_rate": 0.0002, "epoch": 1.6276477146042363, "step": 1460}, {"loss": 1.6896, "grad_norm": 0.3591409921646118, "learning_rate": 0.0002, "epoch": 1.6387959866220736, "step": 1470}, {"loss": 1.6831, "grad_norm": 0.3776722848415375, "learning_rate": 0.0002, "epoch": 1.649944258639911, "step": 1480}, {"loss": 1.7511, "grad_norm": 0.3761521875858307, "learning_rate": 0.0002, "epoch": 1.661092530657748, "step": 1490}, {"loss": 1.7464, "grad_norm": 0.33939364552497864, "learning_rate": 0.0002, "epoch": 1.6722408026755853, "step": 1500}, {"loss": 1.6522, "grad_norm": 0.3961067795753479, "learning_rate": 0.0002, "epoch": 1.6833890746934226, "step": 1510}, {"loss": 1.7849, "grad_norm": 0.36793094873428345, "learning_rate": 0.0002, "epoch": 1.6945373467112597, "step": 1520}, {"loss": 1.7057, "grad_norm": 0.4201025068759918, "learning_rate": 0.0002, "epoch": 1.705685618729097, "step": 1530}, {"loss": 1.6656, "grad_norm": 0.382280558347702, "learning_rate": 0.0002, "epoch": 1.7168338907469343, "step": 1540}, {"loss": 1.7987, "grad_norm": 0.4504372477531433, "learning_rate": 0.0002, "epoch": 1.7279821627647713, "step": 1550}, {"loss": 1.7889, "grad_norm": 0.36121585965156555, "learning_rate": 0.0002, "epoch": 1.7391304347826086, "step": 1560}, {"loss": 1.7282, "grad_norm": 0.38416755199432373, "learning_rate": 0.0002, "epoch": 1.750278706800446, "step": 1570}, {"loss": 1.7759, "grad_norm": 0.3920411467552185, "learning_rate": 0.0002, "epoch": 1.761426978818283, "step": 1580}, {"loss": 1.7693, "grad_norm": 0.4326777756214142, "learning_rate": 0.0002, "epoch": 1.7725752508361206, "step": 1590}, {"loss": 1.6804, "grad_norm": 0.3582489490509033, "learning_rate": 0.0002, "epoch": 1.7837235228539576, "step": 1600}, {"loss": 1.706, "grad_norm": 0.36345767974853516, "learning_rate": 0.0002, "epoch": 1.7948717948717947, "step": 1610}, {"loss": 1.75, "grad_norm": 0.3951990008354187, "learning_rate": 0.0002, "epoch": 1.8060200668896322, "step": 1620}, {"loss": 1.8034, "grad_norm": 0.35174235701560974, "learning_rate": 0.0002, "epoch": 1.8171683389074693, "step": 1630}, {"loss": 1.725, "grad_norm": 0.37005263566970825, "learning_rate": 0.0002, "epoch": 1.8283166109253066, "step": 1640}, {"loss": 1.695, "grad_norm": 0.42875173687934875, "learning_rate": 0.0002, "epoch": 1.839464882943144, "step": 1650}, {"loss": 1.7589, "grad_norm": 0.3646032512187958, "learning_rate": 0.0002, "epoch": 1.850613154960981, "step": 1660}, {"loss": 1.6698, "grad_norm": 0.38111618161201477, "learning_rate": 0.0002, "epoch": 1.8617614269788183, "step": 1670}, {"loss": 1.7832, "grad_norm": 0.3825555443763733, "learning_rate": 0.0002, "epoch": 1.8729096989966556, "step": 1680}, {"loss": 1.7599, "grad_norm": 0.36418095231056213, "learning_rate": 0.0002, "epoch": 1.8840579710144927, "step": 1690}, {"loss": 1.6532, "grad_norm": 0.36551007628440857, "learning_rate": 0.0002, "epoch": 1.89520624303233, "step": 1700}, {"loss": 1.7174, "grad_norm": 0.36421480774879456, "learning_rate": 0.0002, "epoch": 1.9063545150501673, "step": 1710}, {"loss": 1.7176, "grad_norm": 0.3791242241859436, "learning_rate": 0.0002, "epoch": 1.9175027870680044, "step": 1720}, {"loss": 1.7961, "grad_norm": 0.36655193567276, "learning_rate": 0.0002, "epoch": 1.9286510590858417, "step": 1730}, {"loss": 1.7765, "grad_norm": 0.3526945412158966, "learning_rate": 0.0002, "epoch": 1.939799331103679, "step": 1740}, {"loss": 1.7047, "grad_norm": 0.41139861941337585, "learning_rate": 0.0002, "epoch": 1.950947603121516, "step": 1750}, {"loss": 1.8155, "grad_norm": 0.41757065057754517, "learning_rate": 0.0002, "epoch": 1.9620958751393534, "step": 1760}, {"loss": 1.7271, "grad_norm": 0.38956186175346375, "learning_rate": 0.0002, "epoch": 1.9732441471571907, "step": 1770}, {"loss": 1.7653, "grad_norm": 0.33891627192497253, "learning_rate": 0.0002, "epoch": 1.9843924191750277, "step": 1780}, {"loss": 1.7305, "grad_norm": 0.42879191040992737, "learning_rate": 0.0002, "epoch": 1.9955406911928653, "step": 1790}, {"eval_loss": 1.8116765022277832, "eval_runtime": 37.9859, "eval_samples_per_second": 13.558, "eval_steps_per_second": 1.711, "epoch": 2.0, "step": 1794}, {"loss": 1.6724, "grad_norm": 0.42103368043899536, "learning_rate": 0.0002, "epoch": 2.0066889632107023, "step": 1800}, {"loss": 1.5812, "grad_norm": 0.41505053639411926, "learning_rate": 0.0002, "epoch": 2.0178372352285394, "step": 1810}, {"loss": 1.6132, "grad_norm": 0.398190438747406, "learning_rate": 0.0002, "epoch": 2.028985507246377, "step": 1820}, {"loss": 1.6497, "grad_norm": 0.4371621310710907, "learning_rate": 0.0002, "epoch": 2.040133779264214, "step": 1830}, {"loss": 1.6501, "grad_norm": 0.45679208636283875, "learning_rate": 0.0002, "epoch": 2.051282051282051, "step": 1840}, {"loss": 1.5773, "grad_norm": 0.43211811780929565, "learning_rate": 0.0002, "epoch": 2.0624303232998886, "step": 1850}, {"loss": 1.6414, "grad_norm": 0.47492915391921997, "learning_rate": 0.0002, "epoch": 2.0735785953177257, "step": 1860}, {"loss": 1.7169, "grad_norm": 0.41742339730262756, "learning_rate": 0.0002, "epoch": 2.084726867335563, "step": 1870}, {"loss": 1.5762, "grad_norm": 0.45789217948913574, "learning_rate": 0.0002, "epoch": 2.0958751393534003, "step": 1880}, {"loss": 1.6896, "grad_norm": 0.43958935141563416, "learning_rate": 0.0002, "epoch": 2.1070234113712374, "step": 1890}, {"loss": 1.6444, "grad_norm": 0.43991968035697937, "learning_rate": 0.0002, "epoch": 2.1181716833890745, "step": 1900}, {"loss": 1.6057, "grad_norm": 0.4667953848838806, "learning_rate": 0.0002, "epoch": 2.129319955406912, "step": 1910}, {"loss": 1.5999, "grad_norm": 0.42225760221481323, "learning_rate": 0.0002, "epoch": 2.140468227424749, "step": 1920}, {"loss": 1.6525, "grad_norm": 0.418850839138031, "learning_rate": 0.0002, "epoch": 2.1516164994425866, "step": 1930}, {"loss": 1.6091, "grad_norm": 0.43838515877723694, "learning_rate": 0.0002, "epoch": 2.1627647714604237, "step": 1940}, {"loss": 1.6837, "grad_norm": 0.43798115849494934, "learning_rate": 0.0002, "epoch": 2.1739130434782608, "step": 1950}, {"loss": 1.632, "grad_norm": 0.4456610679626465, "learning_rate": 0.0002, "epoch": 2.1850613154960983, "step": 1960}, {"loss": 1.6338, "grad_norm": 0.4619026482105255, "learning_rate": 0.0002, "epoch": 2.1962095875139354, "step": 1970}, {"loss": 1.6989, "grad_norm": 0.4732453525066376, "learning_rate": 0.0002, "epoch": 2.2073578595317724, "step": 1980}, {"loss": 1.581, "grad_norm": 0.42551836371421814, "learning_rate": 0.0002, "epoch": 2.21850613154961, "step": 1990}, {"loss": 1.6386, "grad_norm": 0.45154353976249695, "learning_rate": 0.0002, "epoch": 2.229654403567447, "step": 2000}, {"loss": 1.6768, "grad_norm": 0.4655696451663971, "learning_rate": 0.0002, "epoch": 2.240802675585284, "step": 2010}, {"loss": 1.6972, "grad_norm": 0.5363447666168213, "learning_rate": 0.0002, "epoch": 2.2519509476031216, "step": 2020}, {"loss": 1.6561, "grad_norm": 0.4839927852153778, "learning_rate": 0.0002, "epoch": 2.2630992196209587, "step": 2030}, {"loss": 1.6838, "grad_norm": 0.4639221727848053, "learning_rate": 0.0002, "epoch": 2.274247491638796, "step": 2040}, {"loss": 1.6063, "grad_norm": 0.46169278025627136, "learning_rate": 0.0002, "epoch": 2.2853957636566333, "step": 2050}, {"loss": 1.5924, "grad_norm": 0.4582304060459137, "learning_rate": 0.0002, "epoch": 2.2965440356744704, "step": 2060}, {"loss": 1.5778, "grad_norm": 0.48619818687438965, "learning_rate": 0.0002, "epoch": 2.3076923076923075, "step": 2070}, {"loss": 1.633, "grad_norm": 0.4382200241088867, "learning_rate": 0.0002, "epoch": 2.318840579710145, "step": 2080}, {"loss": 1.5854, "grad_norm": 0.4103265106678009, "learning_rate": 0.0002, "epoch": 2.329988851727982, "step": 2090}, {"loss": 1.7042, "grad_norm": 0.5136023759841919, "learning_rate": 0.0002, "epoch": 2.3411371237458196, "step": 2100}, {"loss": 1.5723, "grad_norm": 0.46723702549934387, "learning_rate": 0.0002, "epoch": 2.3522853957636567, "step": 2110}, {"loss": 1.6852, "grad_norm": 0.42269468307495117, "learning_rate": 0.0002, "epoch": 2.3634336677814938, "step": 2120}, {"loss": 1.6369, "grad_norm": 0.42611163854599, "learning_rate": 0.0002, "epoch": 2.374581939799331, "step": 2130}, {"loss": 1.5879, "grad_norm": 0.4573901891708374, "learning_rate": 0.0002, "epoch": 2.3857302118171684, "step": 2140}, {"loss": 1.6317, "grad_norm": 0.4758673310279846, "learning_rate": 0.0002, "epoch": 2.3968784838350055, "step": 2150}, {"loss": 1.6527, "grad_norm": 0.49616846442222595, "learning_rate": 0.0002, "epoch": 2.408026755852843, "step": 2160}, {"loss": 1.5796, "grad_norm": 0.5278240442276001, "learning_rate": 0.0002, "epoch": 2.41917502787068, "step": 2170}, {"loss": 1.6746, "grad_norm": 0.46806028485298157, "learning_rate": 0.0002, "epoch": 2.430323299888517, "step": 2180}, {"loss": 1.676, "grad_norm": 0.44507312774658203, "learning_rate": 0.0002, "epoch": 2.4414715719063547, "step": 2190}, {"loss": 1.6793, "grad_norm": 0.45716050267219543, "learning_rate": 0.0002, "epoch": 2.4526198439241917, "step": 2200}, {"loss": 1.6198, "grad_norm": 0.4226573705673218, "learning_rate": 0.0002, "epoch": 2.463768115942029, "step": 2210}, {"loss": 1.5721, "grad_norm": 0.4488418400287628, "learning_rate": 0.0002, "epoch": 2.4749163879598663, "step": 2220}, {"loss": 1.6399, "grad_norm": 0.48324450850486755, "learning_rate": 0.0002, "epoch": 2.4860646599777034, "step": 2230}, {"loss": 1.6228, "grad_norm": 0.4866982400417328, "learning_rate": 0.0002, "epoch": 2.4972129319955405, "step": 2240}, {"loss": 1.6887, "grad_norm": 0.4784172773361206, "learning_rate": 0.0002, "epoch": 2.508361204013378, "step": 2250}, {"loss": 1.6905, "grad_norm": 0.4250621199607849, "learning_rate": 0.0002, "epoch": 2.519509476031215, "step": 2260}, {"loss": 1.6582, "grad_norm": 0.431224524974823, "learning_rate": 0.0002, "epoch": 2.5306577480490526, "step": 2270}, {"loss": 1.5981, "grad_norm": 0.3931371867656708, "learning_rate": 0.0002, "epoch": 2.5418060200668897, "step": 2280}, {"loss": 1.6897, "grad_norm": 0.4800887703895569, "learning_rate": 0.0002, "epoch": 2.552954292084727, "step": 2290}, {"loss": 1.6205, "grad_norm": 0.4288487136363983, "learning_rate": 0.0002, "epoch": 2.564102564102564, "step": 2300}, {"loss": 1.6005, "grad_norm": 0.48489660024642944, "learning_rate": 0.0002, "epoch": 2.5752508361204014, "step": 2310}, {"loss": 1.6447, "grad_norm": 0.4221740961074829, "learning_rate": 0.0002, "epoch": 2.5863991081382385, "step": 2320}, {"loss": 1.666, "grad_norm": 0.4413852393627167, "learning_rate": 0.0002, "epoch": 2.597547380156076, "step": 2330}, {"loss": 1.6863, "grad_norm": 0.4391345679759979, "learning_rate": 0.0002, "epoch": 2.608695652173913, "step": 2340}, {"loss": 1.6942, "grad_norm": 0.4824720323085785, "learning_rate": 0.0002, "epoch": 2.61984392419175, "step": 2350}, {"loss": 1.5615, "grad_norm": 0.4023158550262451, "learning_rate": 0.0002, "epoch": 2.6309921962095872, "step": 2360}, {"loss": 1.698, "grad_norm": 0.5107841491699219, "learning_rate": 0.0002, "epoch": 2.6421404682274248, "step": 2370}, {"loss": 1.6258, "grad_norm": 0.4705312252044678, "learning_rate": 0.0002, "epoch": 2.653288740245262, "step": 2380}, {"loss": 1.7294, "grad_norm": 0.4420899450778961, "learning_rate": 0.0002, "epoch": 2.6644370122630994, "step": 2390}, {"loss": 1.6246, "grad_norm": 0.413308709859848, "learning_rate": 0.0002, "epoch": 2.6755852842809364, "step": 2400}, {"loss": 1.565, "grad_norm": 0.4312658905982971, "learning_rate": 0.0002, "epoch": 2.6867335562987735, "step": 2410}, {"loss": 1.617, "grad_norm": 0.44714513421058655, "learning_rate": 0.0002, "epoch": 2.697881828316611, "step": 2420}, {"loss": 1.6185, "grad_norm": 0.49152931571006775, "learning_rate": 0.0002, "epoch": 2.709030100334448, "step": 2430}, {"loss": 1.5864, "grad_norm": 0.49458765983581543, "learning_rate": 0.0002, "epoch": 2.7201783723522857, "step": 2440}, {"loss": 1.6535, "grad_norm": 0.47838348150253296, "learning_rate": 0.0002, "epoch": 2.7313266443701227, "step": 2450}, {"loss": 1.6836, "grad_norm": 0.5781240463256836, "learning_rate": 0.0002, "epoch": 2.74247491638796, "step": 2460}, {"loss": 1.6141, "grad_norm": 0.4559851884841919, "learning_rate": 0.0002, "epoch": 2.753623188405797, "step": 2470}, {"loss": 1.5589, "grad_norm": 0.4452647566795349, "learning_rate": 0.0002, "epoch": 2.7647714604236344, "step": 2480}, {"loss": 1.6209, "grad_norm": 0.43920454382896423, "learning_rate": 0.0002, "epoch": 2.7759197324414715, "step": 2490}, {"loss": 1.5593, "grad_norm": 0.467780739068985, "learning_rate": 0.0002, "epoch": 2.787068004459309, "step": 2500}, {"loss": 1.6438, "grad_norm": 0.4743262529373169, "learning_rate": 0.0002, "epoch": 2.798216276477146, "step": 2510}, {"loss": 1.6084, "grad_norm": 0.47944432497024536, "learning_rate": 0.0002, "epoch": 2.809364548494983, "step": 2520}, {"loss": 1.6756, "grad_norm": 0.48032790422439575, "learning_rate": 0.0002, "epoch": 2.8205128205128203, "step": 2530}, {"loss": 1.6222, "grad_norm": 0.45569729804992676, "learning_rate": 0.0002, "epoch": 2.831661092530658, "step": 2540}, {"loss": 1.6187, "grad_norm": 0.47940587997436523, "learning_rate": 0.0002, "epoch": 2.842809364548495, "step": 2550}, {"loss": 1.6286, "grad_norm": 0.5215432047843933, "learning_rate": 0.0002, "epoch": 2.8539576365663324, "step": 2560}, {"loss": 1.6718, "grad_norm": 0.4421178102493286, "learning_rate": 0.0002, "epoch": 2.8651059085841695, "step": 2570}, {"loss": 1.6201, "grad_norm": 0.45288747549057007, "learning_rate": 0.0002, "epoch": 2.8762541806020065, "step": 2580}, {"loss": 1.5938, "grad_norm": 0.4472251832485199, "learning_rate": 0.0002, "epoch": 2.887402452619844, "step": 2590}, {"loss": 1.7212, "grad_norm": 0.4396503269672394, "learning_rate": 0.0002, "epoch": 2.898550724637681, "step": 2600}, {"loss": 1.6503, "grad_norm": 0.48590990900993347, "learning_rate": 0.0002, "epoch": 2.9096989966555182, "step": 2610}, {"loss": 1.5914, "grad_norm": 0.4787760376930237, "learning_rate": 0.0002, "epoch": 2.9208472686733558, "step": 2620}, {"loss": 1.717, "grad_norm": 0.4807611107826233, "learning_rate": 0.0002, "epoch": 2.931995540691193, "step": 2630}, {"loss": 1.6794, "grad_norm": 0.4625583291053772, "learning_rate": 0.0002, "epoch": 2.94314381270903, "step": 2640}, {"loss": 1.663, "grad_norm": 0.4163573980331421, "learning_rate": 0.0002, "epoch": 2.9542920847268674, "step": 2650}, {"loss": 1.6321, "grad_norm": 0.5142832398414612, "learning_rate": 0.0002, "epoch": 2.9654403567447045, "step": 2660}, {"loss": 1.6183, "grad_norm": 0.4459492564201355, "learning_rate": 0.0002, "epoch": 2.976588628762542, "step": 2670}, {"loss": 1.662, "grad_norm": 0.42905503511428833, "learning_rate": 0.0002, "epoch": 2.987736900780379, "step": 2680}, {"loss": 1.6796, "grad_norm": 0.44594648480415344, "learning_rate": 0.0002, "epoch": 2.998885172798216, "step": 2690}, {"eval_loss": 1.8300215005874634, "eval_runtime": 38.0349, "eval_samples_per_second": 13.54, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 2691}, {"loss": 1.5768, "grad_norm": 0.4742245078086853, "learning_rate": 0.0002, "epoch": 3.0100334448160537, "step": 2700}, {"loss": 1.4859, "grad_norm": 0.5157448649406433, "learning_rate": 0.0002, "epoch": 3.021181716833891, "step": 2710}, {"loss": 1.4219, "grad_norm": 0.5634726285934448, "learning_rate": 0.0002, "epoch": 3.032329988851728, "step": 2720}, {"loss": 1.5452, "grad_norm": 0.4554799199104309, "learning_rate": 0.0002, "epoch": 3.0434782608695654, "step": 2730}, {"loss": 1.4784, "grad_norm": 0.6565208435058594, "learning_rate": 0.0002, "epoch": 3.0546265328874025, "step": 2740}, {"loss": 1.459, "grad_norm": 0.6174370050430298, "learning_rate": 0.0002, "epoch": 3.0657748049052396, "step": 2750}, {"loss": 1.469, "grad_norm": 0.4987483024597168, "learning_rate": 0.0002, "epoch": 3.076923076923077, "step": 2760}, {"loss": 1.5466, "grad_norm": 0.5810927152633667, "learning_rate": 0.0002, "epoch": 3.088071348940914, "step": 2770}, {"loss": 1.4936, "grad_norm": 0.5281634330749512, "learning_rate": 0.0002, "epoch": 3.0992196209587513, "step": 2780}, {"loss": 1.4751, "grad_norm": 0.5479053854942322, "learning_rate": 0.0002, "epoch": 3.1103678929765888, "step": 2790}, {"loss": 1.5601, "grad_norm": 0.6192978620529175, "learning_rate": 0.0002, "epoch": 3.121516164994426, "step": 2800}, {"loss": 1.4888, "grad_norm": 0.560117781162262, "learning_rate": 0.0002, "epoch": 3.132664437012263, "step": 2810}, {"loss": 1.5495, "grad_norm": 0.6067224740982056, "learning_rate": 0.0002, "epoch": 3.1438127090301005, "step": 2820}, {"loss": 1.5239, "grad_norm": 0.611287534236908, "learning_rate": 0.0002, "epoch": 3.1549609810479375, "step": 2830}, {"loss": 1.4577, "grad_norm": 0.6441587209701538, "learning_rate": 0.0002, "epoch": 3.1661092530657746, "step": 2840}, {"loss": 1.5322, "grad_norm": 0.5955114364624023, "learning_rate": 0.0002, "epoch": 3.177257525083612, "step": 2850}, {"loss": 1.5222, "grad_norm": 0.5554782748222351, "learning_rate": 0.0002, "epoch": 3.1884057971014492, "step": 2860}, {"loss": 1.4676, "grad_norm": 0.5411370992660522, "learning_rate": 0.0002, "epoch": 3.1995540691192863, "step": 2870}, {"loss": 1.5008, "grad_norm": 0.6152016520500183, "learning_rate": 0.0002, "epoch": 3.210702341137124, "step": 2880}, {"loss": 1.5229, "grad_norm": 0.5711581110954285, "learning_rate": 0.0002, "epoch": 3.221850613154961, "step": 2890}, {"loss": 1.5255, "grad_norm": 0.5399307012557983, "learning_rate": 0.0002, "epoch": 3.2329988851727984, "step": 2900}, {"loss": 1.4888, "grad_norm": 0.60606849193573, "learning_rate": 0.0002, "epoch": 3.2441471571906355, "step": 2910}, {"loss": 1.5056, "grad_norm": 0.5873523950576782, "learning_rate": 0.0002, "epoch": 3.2552954292084726, "step": 2920}, {"loss": 1.5208, "grad_norm": 0.6149439215660095, "learning_rate": 0.0002, "epoch": 3.26644370122631, "step": 2930}, {"loss": 1.4942, "grad_norm": 0.5940659046173096, "learning_rate": 0.0002, "epoch": 3.277591973244147, "step": 2940}, {"loss": 1.5031, "grad_norm": 0.6846756339073181, "learning_rate": 0.0002, "epoch": 3.2887402452619843, "step": 2950}, {"loss": 1.5425, "grad_norm": 0.6708254218101501, "learning_rate": 0.0002, "epoch": 3.299888517279822, "step": 2960}, {"loss": 1.5319, "grad_norm": 0.5966503620147705, "learning_rate": 0.0002, "epoch": 3.311036789297659, "step": 2970}, {"loss": 1.5173, "grad_norm": 0.6328812837600708, "learning_rate": 0.0002, "epoch": 3.322185061315496, "step": 2980}, {"loss": 1.5096, "grad_norm": 0.6082745790481567, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2990}, {"loss": 1.5122, "grad_norm": 0.6207539439201355, "learning_rate": 0.0002, "epoch": 3.3444816053511706, "step": 3000}, {"loss": 1.5053, "grad_norm": 0.5501793026924133, "learning_rate": 0.0002, "epoch": 3.3556298773690076, "step": 3010}, {"loss": 1.4428, "grad_norm": 0.571275532245636, "learning_rate": 0.0002, "epoch": 3.366778149386845, "step": 3020}, {"loss": 1.5914, "grad_norm": 0.7003518342971802, "learning_rate": 0.0002, "epoch": 3.3779264214046822, "step": 3030}, {"loss": 1.5359, "grad_norm": 0.609527587890625, "learning_rate": 0.0002, "epoch": 3.3890746934225193, "step": 3040}, {"loss": 1.5072, "grad_norm": 0.5880036354064941, "learning_rate": 0.0002, "epoch": 3.400222965440357, "step": 3050}, {"loss": 1.5451, "grad_norm": 0.5847334265708923, "learning_rate": 0.0002, "epoch": 3.411371237458194, "step": 3060}, {"loss": 1.4738, "grad_norm": 0.5373924970626831, "learning_rate": 0.0002, "epoch": 3.4225195094760315, "step": 3070}, {"loss": 1.5215, "grad_norm": 0.6074833869934082, "learning_rate": 0.0002, "epoch": 3.4336677814938685, "step": 3080}, {"loss": 1.458, "grad_norm": 0.5118414163589478, "learning_rate": 0.0002, "epoch": 3.4448160535117056, "step": 3090}, {"loss": 1.5006, "grad_norm": 0.5577956438064575, "learning_rate": 0.0002, "epoch": 3.4559643255295427, "step": 3100}, {"loss": 1.5057, "grad_norm": 0.5654811859130859, "learning_rate": 0.0002, "epoch": 3.46711259754738, "step": 3110}, {"loss": 1.523, "grad_norm": 0.6216017603874207, "learning_rate": 0.0002, "epoch": 3.4782608695652173, "step": 3120}, {"loss": 1.5292, "grad_norm": 0.5983642339706421, "learning_rate": 0.0002, "epoch": 3.489409141583055, "step": 3130}, {"loss": 1.5568, "grad_norm": 0.6635708212852478, "learning_rate": 0.0002, "epoch": 3.500557413600892, "step": 3140}, {"loss": 1.4633, "grad_norm": 0.6254258751869202, "learning_rate": 0.0002, "epoch": 3.511705685618729, "step": 3150}, {"loss": 1.4934, "grad_norm": 0.6359851360321045, "learning_rate": 0.0002, "epoch": 3.522853957636566, "step": 3160}, {"loss": 1.4693, "grad_norm": 0.5938616394996643, "learning_rate": 0.0002, "epoch": 3.5340022296544036, "step": 3170}, {"loss": 1.4393, "grad_norm": 0.6360630393028259, "learning_rate": 0.0002, "epoch": 3.5451505016722407, "step": 3180}, {"loss": 1.5535, "grad_norm": 0.6097670197486877, "learning_rate": 0.0002, "epoch": 3.556298773690078, "step": 3190}, {"loss": 1.5427, "grad_norm": 0.5984025597572327, "learning_rate": 0.0002, "epoch": 3.5674470457079153, "step": 3200}, {"loss": 1.4741, "grad_norm": 0.5463748574256897, "learning_rate": 0.0002, "epoch": 3.5785953177257523, "step": 3210}, {"loss": 1.513, "grad_norm": 1.0017699003219604, "learning_rate": 0.0002, "epoch": 3.58974358974359, "step": 3220}, {"loss": 1.5687, "grad_norm": 0.6519441604614258, "learning_rate": 0.0002, "epoch": 3.600891861761427, "step": 3230}, {"loss": 1.5168, "grad_norm": 0.6457271575927734, "learning_rate": 0.0002, "epoch": 3.6120401337792645, "step": 3240}, {"loss": 1.5511, "grad_norm": 0.5898868441581726, "learning_rate": 0.0002, "epoch": 3.6231884057971016, "step": 3250}, {"loss": 1.5833, "grad_norm": 0.6612270474433899, "learning_rate": 0.0002, "epoch": 3.6343366778149386, "step": 3260}, {"loss": 1.4537, "grad_norm": 0.5102090239524841, "learning_rate": 0.0002, "epoch": 3.6454849498327757, "step": 3270}, {"loss": 1.4676, "grad_norm": 0.5357231497764587, "learning_rate": 0.0002, "epoch": 3.6566332218506132, "step": 3280}, {"loss": 1.5417, "grad_norm": 0.6176130175590515, "learning_rate": 0.0002, "epoch": 3.6677814938684503, "step": 3290}, {"loss": 1.5057, "grad_norm": 0.6384354829788208, "learning_rate": 0.0002, "epoch": 3.678929765886288, "step": 3300}, {"loss": 1.5973, "grad_norm": 0.5493269562721252, "learning_rate": 0.0002, "epoch": 3.690078037904125, "step": 3310}, {"loss": 1.5958, "grad_norm": 0.5721797943115234, "learning_rate": 0.0002, "epoch": 3.701226309921962, "step": 3320}, {"loss": 1.5098, "grad_norm": 0.6667633056640625, "learning_rate": 0.0002, "epoch": 3.712374581939799, "step": 3330}, {"loss": 1.5372, "grad_norm": 0.5713372826576233, "learning_rate": 0.0002, "epoch": 3.7235228539576366, "step": 3340}, {"loss": 1.5959, "grad_norm": 0.5925018191337585, "learning_rate": 0.0002, "epoch": 3.7346711259754737, "step": 3350}, {"loss": 1.5045, "grad_norm": 0.5660955905914307, "learning_rate": 0.0002, "epoch": 3.745819397993311, "step": 3360}, {"loss": 1.5465, "grad_norm": 0.5470759868621826, "learning_rate": 0.0002, "epoch": 3.7569676700111483, "step": 3370}, {"loss": 1.547, "grad_norm": 0.7612935900688171, "learning_rate": 0.0002, "epoch": 3.7681159420289854, "step": 3380}, {"loss": 1.6224, "grad_norm": 0.577467679977417, "learning_rate": 0.0002, "epoch": 3.779264214046823, "step": 3390}, {"loss": 1.5653, "grad_norm": 0.6125091910362244, "learning_rate": 0.0002, "epoch": 3.79041248606466, "step": 3400}, {"loss": 1.5463, "grad_norm": 0.590386152267456, "learning_rate": 0.0002, "epoch": 3.801560758082497, "step": 3410}, {"loss": 1.5944, "grad_norm": 0.5530361533164978, "learning_rate": 0.0002, "epoch": 3.8127090301003346, "step": 3420}, {"loss": 1.4797, "grad_norm": 0.5714079737663269, "learning_rate": 0.0002, "epoch": 3.8238573021181717, "step": 3430}, {"loss": 1.5324, "grad_norm": 0.9061086773872375, "learning_rate": 0.0002, "epoch": 3.8350055741360087, "step": 3440}, {"loss": 1.4513, "grad_norm": 0.6193320751190186, "learning_rate": 0.0002, "epoch": 3.8461538461538463, "step": 3450}, {"loss": 1.5537, "grad_norm": 0.5831704139709473, "learning_rate": 0.0002, "epoch": 3.8573021181716833, "step": 3460}, {"loss": 1.5144, "grad_norm": 0.5971192717552185, "learning_rate": 0.0002, "epoch": 3.868450390189521, "step": 3470}, {"loss": 1.484, "grad_norm": 0.6110154390335083, "learning_rate": 0.0002, "epoch": 3.879598662207358, "step": 3480}, {"loss": 1.5624, "grad_norm": 0.6644453406333923, "learning_rate": 0.0002, "epoch": 3.890746934225195, "step": 3490}, {"loss": 1.5422, "grad_norm": 0.6674908399581909, "learning_rate": 0.0002, "epoch": 3.901895206243032, "step": 3500}, {"loss": 1.579, "grad_norm": 0.5516519546508789, "learning_rate": 0.0002, "epoch": 3.9130434782608696, "step": 3510}, {"loss": 1.5964, "grad_norm": 0.6704319715499878, "learning_rate": 0.0002, "epoch": 3.9241917502787067, "step": 3520}, {"loss": 1.515, "grad_norm": 0.5820314288139343, "learning_rate": 0.0002, "epoch": 3.9353400222965442, "step": 3530}, {"loss": 1.6458, "grad_norm": 0.6931548714637756, "learning_rate": 0.0002, "epoch": 3.9464882943143813, "step": 3540}, {"loss": 1.5338, "grad_norm": 0.6085171103477478, "learning_rate": 0.0002, "epoch": 3.9576365663322184, "step": 3550}, {"loss": 1.5537, "grad_norm": 0.5973535776138306, "learning_rate": 0.0002, "epoch": 3.9687848383500555, "step": 3560}, {"loss": 1.5435, "grad_norm": 0.49761658906936646, "learning_rate": 0.0002, "epoch": 3.979933110367893, "step": 3570}, {"loss": 1.488, "grad_norm": 0.6282512545585632, "learning_rate": 0.0002, "epoch": 3.99108138238573, "step": 3580}, {"eval_loss": 1.8790398836135864, "eval_runtime": 37.9725, "eval_samples_per_second": 13.562, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 3588}, {"loss": 1.5025, "grad_norm": 0.6402973532676697, "learning_rate": 0.0002, "epoch": 4.002229654403568, "step": 3590}, {"loss": 1.3695, "grad_norm": 0.7791030406951904, "learning_rate": 0.0002, "epoch": 4.013377926421405, "step": 3600}, {"loss": 1.3545, "grad_norm": 0.7136624455451965, "learning_rate": 0.0002, "epoch": 4.024526198439242, "step": 3610}, {"loss": 1.3515, "grad_norm": 0.7608486413955688, "learning_rate": 0.0002, "epoch": 4.035674470457079, "step": 3620}, {"loss": 1.3067, "grad_norm": 0.7486591935157776, "learning_rate": 0.0002, "epoch": 4.046822742474917, "step": 3630}, {"loss": 1.3474, "grad_norm": 0.7576302289962769, "learning_rate": 0.0002, "epoch": 4.057971014492754, "step": 3640}, {"loss": 1.3036, "grad_norm": 0.7358254194259644, "learning_rate": 0.0002, "epoch": 4.069119286510591, "step": 3650}, {"loss": 1.3015, "grad_norm": 0.821326494216919, "learning_rate": 0.0002, "epoch": 4.080267558528428, "step": 3660}, {"loss": 1.4186, "grad_norm": 0.7996482253074646, "learning_rate": 0.0002, "epoch": 4.091415830546265, "step": 3670}, {"loss": 1.3671, "grad_norm": 0.8527022004127502, "learning_rate": 0.0002, "epoch": 4.102564102564102, "step": 3680}, {"loss": 1.3818, "grad_norm": 0.7313576340675354, "learning_rate": 0.0002, "epoch": 4.11371237458194, "step": 3690}, {"loss": 1.3307, "grad_norm": 0.7854588627815247, "learning_rate": 0.0002, "epoch": 4.124860646599777, "step": 3700}, {"loss": 1.4174, "grad_norm": 0.6588303446769714, "learning_rate": 0.0002, "epoch": 4.136008918617614, "step": 3710}, {"loss": 1.3674, "grad_norm": 0.7986254692077637, "learning_rate": 0.0002, "epoch": 4.147157190635451, "step": 3720}, {"loss": 1.3505, "grad_norm": 0.6864156126976013, "learning_rate": 0.0002, "epoch": 4.1583054626532885, "step": 3730}, {"loss": 1.2987, "grad_norm": 0.8197885155677795, "learning_rate": 0.0002, "epoch": 4.169453734671126, "step": 3740}, {"loss": 1.3565, "grad_norm": 0.7169402837753296, "learning_rate": 0.0002, "epoch": 4.1806020066889635, "step": 3750}, {"loss": 1.4388, "grad_norm": 0.7948839068412781, "learning_rate": 0.0002, "epoch": 4.191750278706801, "step": 3760}, {"loss": 1.4648, "grad_norm": 0.6775302290916443, "learning_rate": 0.0002, "epoch": 4.202898550724638, "step": 3770}, {"loss": 1.3238, "grad_norm": 0.8913543820381165, "learning_rate": 0.0002, "epoch": 4.214046822742475, "step": 3780}, {"loss": 1.4251, "grad_norm": 0.8046368360519409, "learning_rate": 0.0002, "epoch": 4.225195094760312, "step": 3790}, {"loss": 1.3542, "grad_norm": 0.9359563589096069, "learning_rate": 0.0002, "epoch": 4.236343366778149, "step": 3800}, {"loss": 1.3963, "grad_norm": 0.8012228608131409, "learning_rate": 0.0002, "epoch": 4.247491638795987, "step": 3810}, {"loss": 1.311, "grad_norm": 0.8405851125717163, "learning_rate": 0.0002, "epoch": 4.258639910813824, "step": 3820}, {"loss": 1.3903, "grad_norm": 0.7812899351119995, "learning_rate": 0.0002, "epoch": 4.269788182831661, "step": 3830}, {"loss": 1.4006, "grad_norm": 0.8192463517189026, "learning_rate": 0.0002, "epoch": 4.280936454849498, "step": 3840}, {"loss": 1.3663, "grad_norm": 0.6937220096588135, "learning_rate": 0.0002, "epoch": 4.292084726867335, "step": 3850}, {"loss": 1.391, "grad_norm": 0.7245703935623169, "learning_rate": 0.0002, "epoch": 4.303232998885173, "step": 3860}, {"loss": 1.3351, "grad_norm": 0.7816787362098694, "learning_rate": 0.0002, "epoch": 4.31438127090301, "step": 3870}, {"loss": 1.4316, "grad_norm": 0.7904975414276123, "learning_rate": 0.0002, "epoch": 4.325529542920847, "step": 3880}, {"loss": 1.4722, "grad_norm": 1.0394847393035889, "learning_rate": 0.0002, "epoch": 4.336677814938684, "step": 3890}, {"loss": 1.4574, "grad_norm": 0.7044078707695007, "learning_rate": 0.0002, "epoch": 4.3478260869565215, "step": 3900}, {"loss": 1.3185, "grad_norm": 0.8852819204330444, "learning_rate": 0.0002, "epoch": 4.358974358974359, "step": 3910}, {"loss": 1.3664, "grad_norm": 0.7712758779525757, "learning_rate": 0.0002, "epoch": 4.3701226309921966, "step": 3920}, {"loss": 1.3519, "grad_norm": 0.7677774429321289, "learning_rate": 0.0002, "epoch": 4.381270903010034, "step": 3930}, {"loss": 1.3693, "grad_norm": 0.7450921535491943, "learning_rate": 0.0002, "epoch": 4.392419175027871, "step": 3940}, {"loss": 1.392, "grad_norm": 0.7802795767784119, "learning_rate": 0.0002, "epoch": 4.403567447045708, "step": 3950}, {"loss": 1.3661, "grad_norm": 0.8976508378982544, "learning_rate": 0.0002, "epoch": 4.414715719063545, "step": 3960}, {"loss": 1.4124, "grad_norm": 0.8148922324180603, "learning_rate": 0.0002, "epoch": 4.425863991081382, "step": 3970}, {"loss": 1.3937, "grad_norm": 0.7490504384040833, "learning_rate": 0.0002, "epoch": 4.43701226309922, "step": 3980}, {"loss": 1.393, "grad_norm": 0.753652036190033, "learning_rate": 0.0002, "epoch": 4.448160535117057, "step": 3990}, {"loss": 1.3467, "grad_norm": 0.803986668586731, "learning_rate": 0.0002, "epoch": 4.459308807134894, "step": 4000}, {"loss": 1.3872, "grad_norm": 0.8643081784248352, "learning_rate": 0.0002, "epoch": 4.470457079152731, "step": 4010}, {"loss": 1.407, "grad_norm": 0.8298280835151672, "learning_rate": 0.0002, "epoch": 4.481605351170568, "step": 4020}, {"loss": 1.4555, "grad_norm": 0.705355703830719, "learning_rate": 0.0002, "epoch": 4.492753623188406, "step": 4030}, {"loss": 1.3646, "grad_norm": 0.7845711708068848, "learning_rate": 0.0002, "epoch": 4.503901895206243, "step": 4040}, {"loss": 1.3913, "grad_norm": 0.8056256175041199, "learning_rate": 0.0002, "epoch": 4.51505016722408, "step": 4050}, {"loss": 1.3716, "grad_norm": 0.7080171704292297, "learning_rate": 0.0002, "epoch": 4.5261984392419174, "step": 4060}, {"loss": 1.335, "grad_norm": 0.778388261795044, "learning_rate": 0.0002, "epoch": 4.5373467112597545, "step": 4070}, {"loss": 1.3921, "grad_norm": 0.7337639927864075, "learning_rate": 0.0002, "epoch": 4.548494983277592, "step": 4080}, {"loss": 1.369, "grad_norm": 0.815322756767273, "learning_rate": 0.0002, "epoch": 4.55964325529543, "step": 4090}, {"loss": 1.4509, "grad_norm": 0.8817179203033447, "learning_rate": 0.0002, "epoch": 4.570791527313267, "step": 4100}, {"loss": 1.344, "grad_norm": 0.7526060342788696, "learning_rate": 0.0002, "epoch": 4.581939799331104, "step": 4110}, {"loss": 1.4027, "grad_norm": 0.920465350151062, "learning_rate": 0.0002, "epoch": 4.593088071348941, "step": 4120}, {"loss": 1.3757, "grad_norm": 0.7509559392929077, "learning_rate": 0.0002, "epoch": 4.604236343366778, "step": 4130}, {"loss": 1.4064, "grad_norm": 0.799469530582428, "learning_rate": 0.0002, "epoch": 4.615384615384615, "step": 4140}, {"loss": 1.3689, "grad_norm": 0.8099892735481262, "learning_rate": 0.0002, "epoch": 4.626532887402453, "step": 4150}, {"loss": 1.3689, "grad_norm": 0.7790375351905823, "learning_rate": 0.0002, "epoch": 4.63768115942029, "step": 4160}, {"loss": 1.4626, "grad_norm": 0.8292977809906006, "learning_rate": 0.0002, "epoch": 4.648829431438127, "step": 4170}, {"loss": 1.4505, "grad_norm": 0.8312386274337769, "learning_rate": 0.0002, "epoch": 4.659977703455964, "step": 4180}, {"loss": 1.4301, "grad_norm": 0.7348753809928894, "learning_rate": 0.0002, "epoch": 4.671125975473801, "step": 4190}, {"loss": 1.4074, "grad_norm": 0.8006551265716553, "learning_rate": 0.0002, "epoch": 4.682274247491639, "step": 4200}, {"loss": 1.4349, "grad_norm": 0.8477752804756165, "learning_rate": 0.0002, "epoch": 4.693422519509476, "step": 4210}, {"loss": 1.3943, "grad_norm": 0.7056546211242676, "learning_rate": 0.0002, "epoch": 4.704570791527313, "step": 4220}, {"loss": 1.3415, "grad_norm": 0.7858873009681702, "learning_rate": 0.0002, "epoch": 4.7157190635451505, "step": 4230}, {"loss": 1.3644, "grad_norm": 0.6968740224838257, "learning_rate": 0.0002, "epoch": 4.7268673355629875, "step": 4240}, {"loss": 1.3594, "grad_norm": 0.7886689901351929, "learning_rate": 0.0002, "epoch": 4.738015607580825, "step": 4250}, {"loss": 1.3783, "grad_norm": 0.8935304880142212, "learning_rate": 0.0002, "epoch": 4.749163879598662, "step": 4260}, {"loss": 1.3664, "grad_norm": 0.8395553231239319, "learning_rate": 0.0002, "epoch": 4.7603121516165, "step": 4270}, {"loss": 1.4113, "grad_norm": 0.817263126373291, "learning_rate": 0.0002, "epoch": 4.771460423634337, "step": 4280}, {"loss": 1.4181, "grad_norm": 0.7912008166313171, "learning_rate": 0.0002, "epoch": 4.782608695652174, "step": 4290}, {"loss": 1.4369, "grad_norm": 0.6637866497039795, "learning_rate": 0.0002, "epoch": 4.793756967670011, "step": 4300}, {"loss": 1.4328, "grad_norm": 1.0709338188171387, "learning_rate": 0.0002, "epoch": 4.804905239687848, "step": 4310}, {"loss": 1.4635, "grad_norm": 0.8179698586463928, "learning_rate": 0.0002, "epoch": 4.816053511705686, "step": 4320}, {"loss": 1.3757, "grad_norm": 0.7952052354812622, "learning_rate": 0.0002, "epoch": 4.827201783723523, "step": 4330}, {"loss": 1.3954, "grad_norm": 0.7235367894172668, "learning_rate": 0.0002, "epoch": 4.83835005574136, "step": 4340}, {"loss": 1.4668, "grad_norm": 0.8484606742858887, "learning_rate": 0.0002, "epoch": 4.849498327759197, "step": 4350}, {"loss": 1.3898, "grad_norm": 0.7344942092895508, "learning_rate": 0.0002, "epoch": 4.860646599777034, "step": 4360}, {"loss": 1.4519, "grad_norm": 0.9718546867370605, "learning_rate": 0.0002, "epoch": 4.871794871794872, "step": 4370}, {"loss": 1.4187, "grad_norm": 0.8174259066581726, "learning_rate": 0.0002, "epoch": 4.882943143812709, "step": 4380}, {"loss": 1.3244, "grad_norm": 0.8097165822982788, "learning_rate": 0.0002, "epoch": 4.894091415830546, "step": 4390}, {"loss": 1.3689, "grad_norm": 0.756388783454895, "learning_rate": 0.0002, "epoch": 4.9052396878483835, "step": 4400}, {"loss": 1.4129, "grad_norm": 0.8324617743492126, "learning_rate": 0.0002, "epoch": 4.916387959866221, "step": 4410}, {"loss": 1.3662, "grad_norm": 0.8949803709983826, "learning_rate": 0.0002, "epoch": 4.927536231884058, "step": 4420}, {"loss": 1.4632, "grad_norm": 0.7663722634315491, "learning_rate": 0.0002, "epoch": 4.938684503901895, "step": 4430}, {"loss": 1.3829, "grad_norm": 0.7727946043014526, "learning_rate": 0.0002, "epoch": 4.949832775919733, "step": 4440}, {"loss": 1.4351, "grad_norm": 0.6872350573539734, "learning_rate": 0.0002, "epoch": 4.96098104793757, "step": 4450}, {"loss": 1.4552, "grad_norm": 0.754357099533081, "learning_rate": 0.0002, "epoch": 4.972129319955407, "step": 4460}, {"loss": 1.4, "grad_norm": 0.8068729639053345, "learning_rate": 0.0002, "epoch": 4.983277591973244, "step": 4470}, {"loss": 1.3891, "grad_norm": 0.8200556635856628, "learning_rate": 0.0002, "epoch": 4.994425863991081, "step": 4480}, {"eval_loss": 1.9543706178665161, "eval_runtime": 37.9369, "eval_samples_per_second": 13.575, "eval_steps_per_second": 1.713, "epoch": 5.0, "step": 4485}, {"loss": 1.3194, "grad_norm": 0.7499465942382812, "learning_rate": 0.0002, "epoch": 5.005574136008919, "step": 4490}, {"loss": 1.2143, "grad_norm": 1.030434489250183, "learning_rate": 0.0002, "epoch": 5.016722408026756, "step": 4500}, {"loss": 1.2408, "grad_norm": 0.8914631605148315, "learning_rate": 0.0002, "epoch": 5.027870680044593, "step": 4510}, {"loss": 1.1448, "grad_norm": 0.9902928471565247, "learning_rate": 0.0002, "epoch": 5.03901895206243, "step": 4520}, {"loss": 1.2401, "grad_norm": 0.8338701128959656, "learning_rate": 0.0002, "epoch": 5.050167224080267, "step": 4530}, {"loss": 1.1952, "grad_norm": 0.9440169334411621, "learning_rate": 0.0002, "epoch": 5.061315496098104, "step": 4540}, {"loss": 1.2196, "grad_norm": 0.8755099177360535, "learning_rate": 0.0002, "epoch": 5.072463768115942, "step": 4550}, {"loss": 1.1806, "grad_norm": 0.9145820140838623, "learning_rate": 0.0002, "epoch": 5.083612040133779, "step": 4560}, {"loss": 1.147, "grad_norm": 1.0068492889404297, "learning_rate": 0.0002, "epoch": 5.0947603121516165, "step": 4570}, {"loss": 1.2192, "grad_norm": 0.9184673428535461, "learning_rate": 0.0002, "epoch": 5.105908584169454, "step": 4580}, {"loss": 1.2948, "grad_norm": 1.1158655881881714, "learning_rate": 0.0002, "epoch": 5.117056856187291, "step": 4590}, {"loss": 1.2423, "grad_norm": 0.9685078263282776, "learning_rate": 0.0002, "epoch": 5.128205128205128, "step": 4600}, {"loss": 1.2654, "grad_norm": 1.0389559268951416, "learning_rate": 0.0002, "epoch": 5.139353400222966, "step": 4610}, {"loss": 1.1965, "grad_norm": 1.0294485092163086, "learning_rate": 0.0002, "epoch": 5.150501672240803, "step": 4620}, {"loss": 1.296, "grad_norm": 0.9368783235549927, "learning_rate": 0.0002, "epoch": 5.16164994425864, "step": 4630}, {"loss": 1.206, "grad_norm": 0.9724945425987244, "learning_rate": 0.0002, "epoch": 5.172798216276477, "step": 4640}, {"loss": 1.2319, "grad_norm": 0.876488447189331, "learning_rate": 0.0002, "epoch": 5.183946488294314, "step": 4650}, {"loss": 1.2506, "grad_norm": 0.9106290340423584, "learning_rate": 0.0002, "epoch": 5.195094760312152, "step": 4660}, {"loss": 1.2896, "grad_norm": 1.0924615859985352, "learning_rate": 0.0002, "epoch": 5.206243032329989, "step": 4670}, {"loss": 1.245, "grad_norm": 1.0379078388214111, "learning_rate": 0.0002, "epoch": 5.217391304347826, "step": 4680}, {"loss": 1.2155, "grad_norm": 0.9507831931114197, "learning_rate": 0.0002, "epoch": 5.228539576365663, "step": 4690}, {"loss": 1.2318, "grad_norm": 1.0408620834350586, "learning_rate": 0.0002, "epoch": 5.2396878483835, "step": 4700}, {"loss": 1.1819, "grad_norm": 0.9463635087013245, "learning_rate": 0.0002, "epoch": 5.250836120401337, "step": 4710}, {"loss": 1.1951, "grad_norm": 0.8919326663017273, "learning_rate": 0.0002, "epoch": 5.261984392419175, "step": 4720}, {"loss": 1.228, "grad_norm": 1.0364950895309448, "learning_rate": 0.0002, "epoch": 5.2731326644370125, "step": 4730}, {"loss": 1.2543, "grad_norm": 1.0225472450256348, "learning_rate": 0.0002, "epoch": 5.2842809364548495, "step": 4740}, {"loss": 1.1995, "grad_norm": 0.816410481929779, "learning_rate": 0.0002, "epoch": 5.295429208472687, "step": 4750}, {"loss": 1.3601, "grad_norm": 1.0793992280960083, "learning_rate": 0.0002, "epoch": 5.306577480490524, "step": 4760}, {"loss": 1.2424, "grad_norm": 1.0203443765640259, "learning_rate": 0.0002, "epoch": 5.317725752508361, "step": 4770}, {"loss": 1.239, "grad_norm": 1.0731306076049805, "learning_rate": 0.0002, "epoch": 5.328874024526199, "step": 4780}, {"loss": 1.2893, "grad_norm": 0.9282820224761963, "learning_rate": 0.0002, "epoch": 5.340022296544036, "step": 4790}, {"loss": 1.2159, "grad_norm": 0.9741092920303345, "learning_rate": 0.0002, "epoch": 5.351170568561873, "step": 4800}, {"loss": 1.24, "grad_norm": 1.0683609247207642, "learning_rate": 0.0002, "epoch": 5.36231884057971, "step": 4810}, {"loss": 1.2316, "grad_norm": 0.9035003781318665, "learning_rate": 0.0002, "epoch": 5.373467112597547, "step": 4820}, {"loss": 1.2615, "grad_norm": 1.0590119361877441, "learning_rate": 0.0002, "epoch": 5.384615384615385, "step": 4830}, {"loss": 1.2089, "grad_norm": 0.9782686829566956, "learning_rate": 0.0002, "epoch": 5.395763656633222, "step": 4840}, {"loss": 1.3019, "grad_norm": 1.036087155342102, "learning_rate": 0.0002, "epoch": 5.406911928651059, "step": 4850}, {"loss": 1.2475, "grad_norm": 0.9999949932098389, "learning_rate": 0.0002, "epoch": 5.418060200668896, "step": 4860}, {"loss": 1.3014, "grad_norm": 0.9094445109367371, "learning_rate": 0.0002, "epoch": 5.429208472686733, "step": 4870}, {"loss": 1.2013, "grad_norm": 0.9079708456993103, "learning_rate": 0.0002, "epoch": 5.44035674470457, "step": 4880}, {"loss": 1.2224, "grad_norm": 1.0426156520843506, "learning_rate": 0.0002, "epoch": 5.451505016722408, "step": 4890}, {"loss": 1.2812, "grad_norm": 1.0110737085342407, "learning_rate": 0.0002, "epoch": 5.4626532887402455, "step": 4900}, {"loss": 1.2178, "grad_norm": 1.0994000434875488, "learning_rate": 0.0002, "epoch": 5.4738015607580826, "step": 4910}, {"loss": 1.2019, "grad_norm": 0.8988325595855713, "learning_rate": 0.0002, "epoch": 5.48494983277592, "step": 4920}, {"loss": 1.2694, "grad_norm": 1.0705887079238892, "learning_rate": 0.0002, "epoch": 5.496098104793757, "step": 4930}, {"loss": 1.1659, "grad_norm": 1.0268803834915161, "learning_rate": 0.0002, "epoch": 5.507246376811594, "step": 4940}, {"loss": 1.2845, "grad_norm": 1.0129153728485107, "learning_rate": 0.0002, "epoch": 5.518394648829432, "step": 4950}, {"loss": 1.2081, "grad_norm": 1.122117280960083, "learning_rate": 0.0002, "epoch": 5.529542920847269, "step": 4960}, {"loss": 1.2828, "grad_norm": 1.0318635702133179, "learning_rate": 0.0002, "epoch": 5.540691192865106, "step": 4970}, {"loss": 1.2424, "grad_norm": 0.9340117573738098, "learning_rate": 0.0002, "epoch": 5.551839464882943, "step": 4980}, {"loss": 1.1541, "grad_norm": 0.9427006244659424, "learning_rate": 0.0002, "epoch": 5.56298773690078, "step": 4990}, {"loss": 1.2911, "grad_norm": 1.1786518096923828, "learning_rate": 0.0002, "epoch": 5.574136008918618, "step": 5000}, {"loss": 1.2279, "grad_norm": 1.045157551765442, "learning_rate": 0.0002, "epoch": 5.585284280936455, "step": 5010}, {"loss": 1.2269, "grad_norm": 1.0475151538848877, "learning_rate": 0.0002, "epoch": 5.596432552954292, "step": 5020}, {"loss": 1.2718, "grad_norm": 1.040969729423523, "learning_rate": 0.0002, "epoch": 5.607580824972129, "step": 5030}, {"loss": 1.2134, "grad_norm": 0.9610048532485962, "learning_rate": 0.0002, "epoch": 5.618729096989966, "step": 5040}, {"loss": 1.1657, "grad_norm": 0.9774818420410156, "learning_rate": 0.0002, "epoch": 5.6298773690078034, "step": 5050}, {"loss": 1.2788, "grad_norm": 0.8715312480926514, "learning_rate": 0.0002, "epoch": 5.641025641025641, "step": 5060}, {"loss": 1.3077, "grad_norm": 0.9484505653381348, "learning_rate": 0.0002, "epoch": 5.6521739130434785, "step": 5070}, {"loss": 1.2787, "grad_norm": 0.8292845487594604, "learning_rate": 0.0002, "epoch": 5.663322185061316, "step": 5080}, {"loss": 1.2357, "grad_norm": 0.9876886606216431, "learning_rate": 0.0002, "epoch": 5.674470457079153, "step": 5090}, {"loss": 1.2864, "grad_norm": 0.9899171590805054, "learning_rate": 0.0002, "epoch": 5.68561872909699, "step": 5100}, {"loss": 1.2747, "grad_norm": 0.9693286418914795, "learning_rate": 0.0002, "epoch": 5.696767001114827, "step": 5110}, {"loss": 1.1952, "grad_norm": 0.958905816078186, "learning_rate": 0.0002, "epoch": 5.707915273132665, "step": 5120}, {"loss": 1.2889, "grad_norm": 0.9924837350845337, "learning_rate": 0.0002, "epoch": 5.719063545150502, "step": 5130}, {"loss": 1.3057, "grad_norm": 0.9551714062690735, "learning_rate": 0.0002, "epoch": 5.730211817168339, "step": 5140}, {"loss": 1.2643, "grad_norm": 1.0407027006149292, "learning_rate": 0.0002, "epoch": 5.741360089186176, "step": 5150}, {"loss": 1.1833, "grad_norm": 0.9688791036605835, "learning_rate": 0.0002, "epoch": 5.752508361204013, "step": 5160}, {"loss": 1.1424, "grad_norm": 1.0091899633407593, "learning_rate": 0.0002, "epoch": 5.763656633221851, "step": 5170}, {"loss": 1.2575, "grad_norm": 0.9393984079360962, "learning_rate": 0.0002, "epoch": 5.774804905239688, "step": 5180}, {"loss": 1.2177, "grad_norm": 1.1439075469970703, "learning_rate": 0.0002, "epoch": 5.785953177257525, "step": 5190}, {"loss": 1.3355, "grad_norm": 1.0178622007369995, "learning_rate": 0.0002, "epoch": 5.797101449275362, "step": 5200}, {"loss": 1.3317, "grad_norm": 0.8440285921096802, "learning_rate": 0.0002, "epoch": 5.808249721293199, "step": 5210}, {"loss": 1.3097, "grad_norm": 0.856838583946228, "learning_rate": 0.0002, "epoch": 5.8193979933110365, "step": 5220}, {"loss": 1.3109, "grad_norm": 0.8676707148551941, "learning_rate": 0.0002, "epoch": 5.8305462653288735, "step": 5230}, {"loss": 1.248, "grad_norm": 1.1034743785858154, "learning_rate": 0.0002, "epoch": 5.8416945373467115, "step": 5240}, {"loss": 1.2473, "grad_norm": 0.9631003737449646, "learning_rate": 0.0002, "epoch": 5.852842809364549, "step": 5250}, {"loss": 1.2693, "grad_norm": 1.0478793382644653, "learning_rate": 0.0002, "epoch": 5.863991081382386, "step": 5260}, {"loss": 1.2349, "grad_norm": 0.9819806218147278, "learning_rate": 0.0002, "epoch": 5.875139353400223, "step": 5270}, {"loss": 1.2817, "grad_norm": 0.8572421073913574, "learning_rate": 0.0002, "epoch": 5.88628762541806, "step": 5280}, {"loss": 1.246, "grad_norm": 0.9328814148902893, "learning_rate": 0.0002, "epoch": 5.897435897435898, "step": 5290}, {"loss": 1.3016, "grad_norm": 1.000305414199829, "learning_rate": 0.0002, "epoch": 5.908584169453735, "step": 5300}, {"loss": 1.3681, "grad_norm": 1.1006377935409546, "learning_rate": 0.0002, "epoch": 5.919732441471572, "step": 5310}, {"loss": 1.3317, "grad_norm": 0.963198721408844, "learning_rate": 0.0002, "epoch": 5.930880713489409, "step": 5320}, {"loss": 1.2713, "grad_norm": 0.8952236175537109, "learning_rate": 0.0002, "epoch": 5.942028985507246, "step": 5330}, {"loss": 1.2536, "grad_norm": 1.0945496559143066, "learning_rate": 0.0002, "epoch": 5.953177257525084, "step": 5340}, {"loss": 1.2768, "grad_norm": 1.0053467750549316, "learning_rate": 0.0002, "epoch": 5.964325529542921, "step": 5350}, {"loss": 1.3075, "grad_norm": 1.032088279724121, "learning_rate": 0.0002, "epoch": 5.975473801560758, "step": 5360}, {"loss": 1.3278, "grad_norm": 1.1068958044052124, "learning_rate": 0.0002, "epoch": 5.986622073578595, "step": 5370}, {"loss": 1.2468, "grad_norm": 1.0064235925674438, "learning_rate": 0.0002, "epoch": 5.997770345596432, "step": 5380}, {"eval_loss": 2.0690135955810547, "eval_runtime": 38.1748, "eval_samples_per_second": 13.491, "eval_steps_per_second": 1.703, "epoch": 6.0, "step": 5382}, {"loss": 1.1062, "grad_norm": 0.9700132608413696, "learning_rate": 0.0002, "epoch": 6.0089186176142695, "step": 5390}, {"loss": 1.097, "grad_norm": 1.159369707107544, "learning_rate": 0.0002, "epoch": 6.0200668896321075, "step": 5400}, {"loss": 1.0646, "grad_norm": 1.332871913909912, "learning_rate": 0.0002, "epoch": 6.0312151616499445, "step": 5410}, {"loss": 1.0882, "grad_norm": 1.2239890098571777, "learning_rate": 0.0002, "epoch": 6.042363433667782, "step": 5420}, {"loss": 1.0505, "grad_norm": 1.5238478183746338, "learning_rate": 0.0002, "epoch": 6.053511705685619, "step": 5430}, {"loss": 1.1423, "grad_norm": 1.24699068069458, "learning_rate": 0.0002, "epoch": 6.064659977703456, "step": 5440}, {"loss": 1.0789, "grad_norm": 1.0891860723495483, "learning_rate": 0.0002, "epoch": 6.075808249721293, "step": 5450}, {"loss": 1.1439, "grad_norm": 1.2695465087890625, "learning_rate": 0.0002, "epoch": 6.086956521739131, "step": 5460}, {"loss": 1.0728, "grad_norm": 1.0630067586898804, "learning_rate": 0.0002, "epoch": 6.098104793756968, "step": 5470}, {"loss": 1.0391, "grad_norm": 0.9666808247566223, "learning_rate": 0.0002, "epoch": 6.109253065774805, "step": 5480}, {"loss": 1.1159, "grad_norm": 0.8925976157188416, "learning_rate": 0.0002, "epoch": 6.120401337792642, "step": 5490}, {"loss": 1.0371, "grad_norm": 1.0824475288391113, "learning_rate": 0.0002, "epoch": 6.131549609810479, "step": 5500}, {"loss": 1.1568, "grad_norm": 1.2315316200256348, "learning_rate": 0.0002, "epoch": 6.142697881828316, "step": 5510}, {"loss": 1.0896, "grad_norm": 1.2484779357910156, "learning_rate": 0.0002, "epoch": 6.153846153846154, "step": 5520}, {"loss": 1.0368, "grad_norm": 1.2468485832214355, "learning_rate": 0.0002, "epoch": 6.164994425863991, "step": 5530}, {"loss": 1.1368, "grad_norm": 1.0837156772613525, "learning_rate": 0.0002, "epoch": 6.176142697881828, "step": 5540}, {"loss": 1.1042, "grad_norm": 1.1650336980819702, "learning_rate": 0.0002, "epoch": 6.187290969899665, "step": 5550}, {"loss": 1.0495, "grad_norm": 1.2004241943359375, "learning_rate": 0.0002, "epoch": 6.1984392419175025, "step": 5560}, {"loss": 1.023, "grad_norm": 1.0223793983459473, "learning_rate": 0.0002, "epoch": 6.20958751393534, "step": 5570}, {"loss": 1.0837, "grad_norm": 1.4045847654342651, "learning_rate": 0.0002, "epoch": 6.2207357859531776, "step": 5580}, {"loss": 1.1168, "grad_norm": 1.3042256832122803, "learning_rate": 0.0002, "epoch": 6.231884057971015, "step": 5590}, {"loss": 1.0138, "grad_norm": 1.1762887239456177, "learning_rate": 0.0002, "epoch": 6.243032329988852, "step": 5600}, {"loss": 1.1651, "grad_norm": 1.1739851236343384, "learning_rate": 0.0002, "epoch": 6.254180602006689, "step": 5610}, {"loss": 1.1004, "grad_norm": 1.2904260158538818, "learning_rate": 0.0002, "epoch": 6.265328874024526, "step": 5620}, {"loss": 1.0803, "grad_norm": 1.3218393325805664, "learning_rate": 0.0002, "epoch": 6.276477146042364, "step": 5630}, {"loss": 1.0876, "grad_norm": 1.241175889968872, "learning_rate": 0.0002, "epoch": 6.287625418060201, "step": 5640}, {"loss": 1.128, "grad_norm": 1.2916349172592163, "learning_rate": 0.0002, "epoch": 6.298773690078038, "step": 5650}, {"loss": 1.1197, "grad_norm": 1.5129448175430298, "learning_rate": 0.0002, "epoch": 6.309921962095875, "step": 5660}, {"loss": 1.0723, "grad_norm": 1.0297393798828125, "learning_rate": 0.0002, "epoch": 6.321070234113712, "step": 5670}, {"loss": 1.0513, "grad_norm": 1.1127521991729736, "learning_rate": 0.0002, "epoch": 6.332218506131549, "step": 5680}, {"loss": 1.0305, "grad_norm": 1.0972518920898438, "learning_rate": 0.0002, "epoch": 6.343366778149387, "step": 5690}, {"loss": 1.0616, "grad_norm": 1.4237337112426758, "learning_rate": 0.0002, "epoch": 6.354515050167224, "step": 5700}, {"loss": 1.0924, "grad_norm": 1.121502161026001, "learning_rate": 0.0002, "epoch": 6.365663322185061, "step": 5710}, {"loss": 1.0208, "grad_norm": 1.1007202863693237, "learning_rate": 0.0002, "epoch": 6.3768115942028984, "step": 5720}, {"loss": 1.1178, "grad_norm": 1.1609363555908203, "learning_rate": 0.0002, "epoch": 6.3879598662207355, "step": 5730}, {"loss": 1.1068, "grad_norm": 1.3008915185928345, "learning_rate": 0.0002, "epoch": 6.399108138238573, "step": 5740}, {"loss": 1.1647, "grad_norm": 1.184460163116455, "learning_rate": 0.0002, "epoch": 6.410256410256411, "step": 5750}, {"loss": 1.109, "grad_norm": 1.2092398405075073, "learning_rate": 0.0002, "epoch": 6.421404682274248, "step": 5760}, {"loss": 1.093, "grad_norm": 1.2273279428482056, "learning_rate": 0.0002, "epoch": 6.432552954292085, "step": 5770}, {"loss": 1.1171, "grad_norm": 1.0721677541732788, "learning_rate": 0.0002, "epoch": 6.443701226309922, "step": 5780}, {"loss": 1.0585, "grad_norm": 1.1679279804229736, "learning_rate": 0.0002, "epoch": 6.454849498327759, "step": 5790}, {"loss": 1.0795, "grad_norm": 1.3658736944198608, "learning_rate": 0.0002, "epoch": 6.465997770345597, "step": 5800}, {"loss": 1.0951, "grad_norm": 1.2440944910049438, "learning_rate": 0.0002, "epoch": 6.477146042363434, "step": 5810}, {"loss": 1.0815, "grad_norm": 1.1838182210922241, "learning_rate": 0.0002, "epoch": 6.488294314381271, "step": 5820}, {"loss": 1.0543, "grad_norm": 1.1993956565856934, "learning_rate": 0.0002, "epoch": 6.499442586399108, "step": 5830}, {"loss": 1.1587, "grad_norm": 1.1028285026550293, "learning_rate": 0.0002, "epoch": 6.510590858416945, "step": 5840}, {"loss": 1.1245, "grad_norm": 1.2117441892623901, "learning_rate": 0.0002, "epoch": 6.521739130434782, "step": 5850}, {"loss": 1.1237, "grad_norm": 1.2012946605682373, "learning_rate": 0.0002, "epoch": 6.53288740245262, "step": 5860}, {"loss": 1.1038, "grad_norm": 1.2491029500961304, "learning_rate": 0.0002, "epoch": 6.544035674470457, "step": 5870}, {"loss": 1.1183, "grad_norm": 1.4130326509475708, "learning_rate": 0.0002, "epoch": 6.555183946488294, "step": 5880}, {"loss": 1.1094, "grad_norm": 1.2596930265426636, "learning_rate": 0.0002, "epoch": 6.5663322185061315, "step": 5890}, {"loss": 1.1445, "grad_norm": 1.32266104221344, "learning_rate": 0.0002, "epoch": 6.5774804905239685, "step": 5900}, {"loss": 1.169, "grad_norm": 1.3093374967575073, "learning_rate": 0.0002, "epoch": 6.588628762541806, "step": 5910}, {"loss": 1.161, "grad_norm": 1.0436453819274902, "learning_rate": 0.0002, "epoch": 6.599777034559644, "step": 5920}, {"loss": 1.1358, "grad_norm": 1.064468502998352, "learning_rate": 0.0002, "epoch": 6.610925306577481, "step": 5930}, {"loss": 1.1443, "grad_norm": 1.2561777830123901, "learning_rate": 0.0002, "epoch": 6.622073578595318, "step": 5940}, {"loss": 1.1088, "grad_norm": 1.2759621143341064, "learning_rate": 0.0002, "epoch": 6.633221850613155, "step": 5950}, {"loss": 1.1103, "grad_norm": 1.0602868795394897, "learning_rate": 0.0002, "epoch": 6.644370122630992, "step": 5960}, {"loss": 1.2081, "grad_norm": 1.2336751222610474, "learning_rate": 0.0002, "epoch": 6.65551839464883, "step": 5970}, {"loss": 1.1264, "grad_norm": 1.1773011684417725, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 5980}, {"loss": 1.1641, "grad_norm": 1.0779681205749512, "learning_rate": 0.0002, "epoch": 6.677814938684504, "step": 5990}, {"loss": 1.1034, "grad_norm": 1.396223783493042, "learning_rate": 0.0002, "epoch": 6.688963210702341, "step": 6000}, {"loss": 1.1418, "grad_norm": 1.2238768339157104, "learning_rate": 0.0002, "epoch": 6.700111482720178, "step": 6010}, {"loss": 1.098, "grad_norm": 1.1152666807174683, "learning_rate": 0.0002, "epoch": 6.711259754738015, "step": 6020}, {"loss": 1.1602, "grad_norm": 1.2376031875610352, "learning_rate": 0.0002, "epoch": 6.722408026755852, "step": 6030}, {"loss": 1.1062, "grad_norm": 1.0868488550186157, "learning_rate": 0.0002, "epoch": 6.73355629877369, "step": 6040}, {"loss": 1.1366, "grad_norm": 1.265913724899292, "learning_rate": 0.0002, "epoch": 6.744704570791527, "step": 6050}, {"loss": 1.0959, "grad_norm": 1.1551072597503662, "learning_rate": 0.0002, "epoch": 6.7558528428093645, "step": 6060}, {"loss": 1.1395, "grad_norm": 1.0813109874725342, "learning_rate": 0.0002, "epoch": 6.767001114827202, "step": 6070}, {"loss": 1.1047, "grad_norm": 1.2367933988571167, "learning_rate": 0.0002, "epoch": 6.778149386845039, "step": 6080}, {"loss": 1.0803, "grad_norm": 1.1612437963485718, "learning_rate": 0.0002, "epoch": 6.789297658862877, "step": 6090}, {"loss": 1.1462, "grad_norm": 1.2715837955474854, "learning_rate": 0.0002, "epoch": 6.800445930880714, "step": 6100}, {"loss": 1.1371, "grad_norm": 1.1385036706924438, "learning_rate": 0.0002, "epoch": 6.811594202898551, "step": 6110}, {"loss": 1.137, "grad_norm": 1.4322341680526733, "learning_rate": 0.0002, "epoch": 6.822742474916388, "step": 6120}, {"loss": 1.1571, "grad_norm": 1.2975877523422241, "learning_rate": 0.0002, "epoch": 6.833890746934225, "step": 6130}, {"loss": 1.1592, "grad_norm": 1.0241044759750366, "learning_rate": 0.0002, "epoch": 6.845039018952063, "step": 6140}, {"loss": 1.1369, "grad_norm": 1.352594017982483, "learning_rate": 0.0002, "epoch": 6.8561872909699, "step": 6150}, {"loss": 1.112, "grad_norm": 1.1166167259216309, "learning_rate": 0.0002, "epoch": 6.867335562987737, "step": 6160}, {"loss": 1.1409, "grad_norm": 1.1596941947937012, "learning_rate": 0.0002, "epoch": 6.878483835005574, "step": 6170}, {"loss": 1.1258, "grad_norm": 1.5753912925720215, "learning_rate": 0.0002, "epoch": 6.889632107023411, "step": 6180}, {"loss": 1.1154, "grad_norm": 1.1857494115829468, "learning_rate": 0.0002, "epoch": 6.900780379041248, "step": 6190}, {"loss": 1.137, "grad_norm": 1.1507896184921265, "learning_rate": 0.0002, "epoch": 6.911928651059085, "step": 6200}, {"loss": 1.1532, "grad_norm": 1.5194647312164307, "learning_rate": 0.0002, "epoch": 6.923076923076923, "step": 6210}, {"loss": 1.1315, "grad_norm": 1.1627732515335083, "learning_rate": 0.0002, "epoch": 6.93422519509476, "step": 6220}, {"loss": 1.1079, "grad_norm": 1.1929609775543213, "learning_rate": 0.0002, "epoch": 6.9453734671125975, "step": 6230}, {"loss": 1.1331, "grad_norm": 1.2704664468765259, "learning_rate": 0.0002, "epoch": 6.956521739130435, "step": 6240}, {"loss": 1.1177, "grad_norm": 1.1791198253631592, "learning_rate": 0.0002, "epoch": 6.967670011148272, "step": 6250}, {"loss": 1.1152, "grad_norm": 1.1948790550231934, "learning_rate": 0.0002, "epoch": 6.97881828316611, "step": 6260}, {"loss": 1.1213, "grad_norm": 1.222116231918335, "learning_rate": 0.0002, "epoch": 6.989966555183947, "step": 6270}, {"eval_loss": 2.174532890319824, "eval_runtime": 38.0962, "eval_samples_per_second": 13.518, "eval_steps_per_second": 1.706, "epoch": 7.0, "step": 6279}, {"loss": 1.1558, "grad_norm": 1.0389306545257568, "learning_rate": 0.0002, "epoch": 7.001114827201784, "step": 6280}, {"loss": 0.9833, "grad_norm": 1.5281798839569092, "learning_rate": 0.0002, "epoch": 7.012263099219621, "step": 6290}, {"loss": 0.9557, "grad_norm": 1.097888708114624, "learning_rate": 0.0002, "epoch": 7.023411371237458, "step": 6300}, {"loss": 0.9435, "grad_norm": 1.4041006565093994, "learning_rate": 0.0002, "epoch": 7.034559643255295, "step": 6310}, {"loss": 0.9183, "grad_norm": 1.3070768117904663, "learning_rate": 0.0002, "epoch": 7.045707915273133, "step": 6320}, {"loss": 0.8845, "grad_norm": 1.5640852451324463, "learning_rate": 0.0002, "epoch": 7.05685618729097, "step": 6330}, {"loss": 0.874, "grad_norm": 1.5929399728775024, "learning_rate": 0.0002, "epoch": 7.068004459308807, "step": 6340}, {"loss": 0.8461, "grad_norm": 1.2621946334838867, "learning_rate": 0.0002, "epoch": 7.079152731326644, "step": 6350}, {"loss": 0.9601, "grad_norm": 1.9438022375106812, "learning_rate": 0.0002, "epoch": 7.090301003344481, "step": 6360}, {"loss": 0.9348, "grad_norm": 1.3711209297180176, "learning_rate": 0.0002, "epoch": 7.101449275362318, "step": 6370}, {"loss": 0.9318, "grad_norm": 1.2935353517532349, "learning_rate": 0.0002, "epoch": 7.112597547380156, "step": 6380}, {"loss": 0.9687, "grad_norm": 1.4326812028884888, "learning_rate": 0.0002, "epoch": 7.1237458193979935, "step": 6390}, {"loss": 0.9552, "grad_norm": 1.604068398475647, "learning_rate": 0.0002, "epoch": 7.1348940914158305, "step": 6400}, {"loss": 0.9692, "grad_norm": 1.5581567287445068, "learning_rate": 0.0002, "epoch": 7.146042363433668, "step": 6410}, {"loss": 0.9209, "grad_norm": 1.3148343563079834, "learning_rate": 0.0002, "epoch": 7.157190635451505, "step": 6420}, {"loss": 0.9401, "grad_norm": 1.3319238424301147, "learning_rate": 0.0002, "epoch": 7.168338907469343, "step": 6430}, {"loss": 0.9306, "grad_norm": 1.3741648197174072, "learning_rate": 0.0002, "epoch": 7.17948717948718, "step": 6440}, {"loss": 0.9681, "grad_norm": 1.2071956396102905, "learning_rate": 0.0002, "epoch": 7.190635451505017, "step": 6450}, {"loss": 0.943, "grad_norm": 1.4183731079101562, "learning_rate": 0.0002, "epoch": 7.201783723522854, "step": 6460}, {"loss": 0.9611, "grad_norm": 1.4467699527740479, "learning_rate": 0.0002, "epoch": 7.212931995540691, "step": 6470}, {"loss": 0.9784, "grad_norm": 1.3801071643829346, "learning_rate": 0.0002, "epoch": 7.224080267558528, "step": 6480}, {"loss": 0.9463, "grad_norm": 1.6222909688949585, "learning_rate": 0.0002, "epoch": 7.235228539576366, "step": 6490}, {"loss": 0.9701, "grad_norm": 1.6431424617767334, "learning_rate": 0.0002, "epoch": 7.246376811594203, "step": 6500}, {"loss": 0.937, "grad_norm": 1.4911304712295532, "learning_rate": 0.0002, "epoch": 7.25752508361204, "step": 6510}, {"loss": 0.933, "grad_norm": 1.3448628187179565, "learning_rate": 0.0002, "epoch": 7.268673355629877, "step": 6520}, {"loss": 0.9399, "grad_norm": 1.2078956365585327, "learning_rate": 0.0002, "epoch": 7.279821627647714, "step": 6530}, {"loss": 0.9865, "grad_norm": 1.6037310361862183, "learning_rate": 0.0002, "epoch": 7.290969899665551, "step": 6540}, {"loss": 0.9763, "grad_norm": 1.541955828666687, "learning_rate": 0.0002, "epoch": 7.302118171683389, "step": 6550}, {"loss": 0.8995, "grad_norm": 1.5351279973983765, "learning_rate": 0.0002, "epoch": 7.3132664437012265, "step": 6560}, {"loss": 0.9742, "grad_norm": 1.4032648801803589, "learning_rate": 0.0002, "epoch": 7.3244147157190636, "step": 6570}, {"loss": 0.9687, "grad_norm": 1.1339422464370728, "learning_rate": 0.0002, "epoch": 7.335562987736901, "step": 6580}, {"loss": 0.9896, "grad_norm": 1.2702211141586304, "learning_rate": 0.0002, "epoch": 7.346711259754738, "step": 6590}, {"loss": 0.9823, "grad_norm": 1.2987596988677979, "learning_rate": 0.0002, "epoch": 7.357859531772576, "step": 6600}, {"loss": 0.9479, "grad_norm": 1.506354808807373, "learning_rate": 0.0002, "epoch": 7.369007803790413, "step": 6610}, {"loss": 0.979, "grad_norm": 1.2649177312850952, "learning_rate": 0.0002, "epoch": 7.38015607580825, "step": 6620}, {"loss": 0.9905, "grad_norm": 1.4871227741241455, "learning_rate": 0.0002, "epoch": 7.391304347826087, "step": 6630}, {"loss": 0.9855, "grad_norm": 1.6173475980758667, "learning_rate": 0.0002, "epoch": 7.402452619843924, "step": 6640}, {"loss": 0.9615, "grad_norm": 1.2726142406463623, "learning_rate": 0.0002, "epoch": 7.413600891861761, "step": 6650}, {"loss": 0.9775, "grad_norm": 1.4965415000915527, "learning_rate": 0.0002, "epoch": 7.424749163879599, "step": 6660}, {"loss": 0.9776, "grad_norm": 1.4861866235733032, "learning_rate": 0.0002, "epoch": 7.435897435897436, "step": 6670}, {"loss": 0.9861, "grad_norm": 1.6286227703094482, "learning_rate": 0.0002, "epoch": 7.447045707915273, "step": 6680}, {"loss": 1.0054, "grad_norm": 1.5688917636871338, "learning_rate": 0.0002, "epoch": 7.45819397993311, "step": 6690}, {"loss": 0.9509, "grad_norm": 1.2886908054351807, "learning_rate": 0.0002, "epoch": 7.469342251950947, "step": 6700}, {"loss": 0.9773, "grad_norm": 1.5951329469680786, "learning_rate": 0.0002, "epoch": 7.4804905239687844, "step": 6710}, {"loss": 1.0291, "grad_norm": 1.4492952823638916, "learning_rate": 0.0002, "epoch": 7.491638795986622, "step": 6720}, {"loss": 1.0378, "grad_norm": 1.6316872835159302, "learning_rate": 0.0002, "epoch": 7.5027870680044595, "step": 6730}, {"loss": 0.9678, "grad_norm": 1.471291422843933, "learning_rate": 0.0002, "epoch": 7.513935340022297, "step": 6740}, {"loss": 0.9368, "grad_norm": 1.5187207460403442, "learning_rate": 0.0002, "epoch": 7.525083612040134, "step": 6750}, {"loss": 1.0068, "grad_norm": 1.5191140174865723, "learning_rate": 0.0002, "epoch": 7.536231884057971, "step": 6760}, {"loss": 0.9835, "grad_norm": 1.402166485786438, "learning_rate": 0.0002, "epoch": 7.547380156075809, "step": 6770}, {"loss": 0.9712, "grad_norm": 1.4154515266418457, "learning_rate": 0.0002, "epoch": 7.558528428093646, "step": 6780}, {"loss": 0.9181, "grad_norm": 1.530374526977539, "learning_rate": 0.0002, "epoch": 7.569676700111483, "step": 6790}, {"loss": 0.9524, "grad_norm": 1.335096836090088, "learning_rate": 0.0002, "epoch": 7.58082497212932, "step": 6800}, {"loss": 0.922, "grad_norm": 1.5730568170547485, "learning_rate": 0.0002, "epoch": 7.591973244147157, "step": 6810}, {"loss": 0.9806, "grad_norm": 1.4692550897598267, "learning_rate": 0.0002, "epoch": 7.603121516164994, "step": 6820}, {"loss": 0.9719, "grad_norm": 1.3645410537719727, "learning_rate": 0.0002, "epoch": 7.614269788182831, "step": 6830}, {"loss": 1.0284, "grad_norm": 1.5139234066009521, "learning_rate": 0.0002, "epoch": 7.625418060200669, "step": 6840}, {"loss": 1.007, "grad_norm": 1.4001535177230835, "learning_rate": 0.0002, "epoch": 7.636566332218506, "step": 6850}, {"loss": 1.0315, "grad_norm": 1.5518683195114136, "learning_rate": 0.0002, "epoch": 7.647714604236343, "step": 6860}, {"loss": 1.0058, "grad_norm": 1.6151013374328613, "learning_rate": 0.0002, "epoch": 7.65886287625418, "step": 6870}, {"loss": 0.9789, "grad_norm": 1.5577940940856934, "learning_rate": 0.0002, "epoch": 7.6700111482720175, "step": 6880}, {"loss": 0.9728, "grad_norm": 1.2788935899734497, "learning_rate": 0.0002, "epoch": 7.681159420289855, "step": 6890}, {"loss": 0.9004, "grad_norm": 1.3274600505828857, "learning_rate": 0.0002, "epoch": 7.6923076923076925, "step": 6900}, {"loss": 0.9739, "grad_norm": 1.3590648174285889, "learning_rate": 0.0002, "epoch": 7.70345596432553, "step": 6910}, {"loss": 0.9639, "grad_norm": 1.4309452772140503, "learning_rate": 0.0002, "epoch": 7.714604236343367, "step": 6920}, {"loss": 0.9725, "grad_norm": 1.3435392379760742, "learning_rate": 0.0002, "epoch": 7.725752508361204, "step": 6930}, {"loss": 1.0013, "grad_norm": 1.519593358039856, "learning_rate": 0.0002, "epoch": 7.736900780379042, "step": 6940}, {"loss": 0.9149, "grad_norm": 1.1542080640792847, "learning_rate": 0.0002, "epoch": 7.748049052396879, "step": 6950}, {"loss": 1.055, "grad_norm": 1.3358652591705322, "learning_rate": 0.0002, "epoch": 7.759197324414716, "step": 6960}, {"loss": 0.9777, "grad_norm": 1.526912808418274, "learning_rate": 0.0002, "epoch": 7.770345596432553, "step": 6970}, {"loss": 0.9855, "grad_norm": 1.303989052772522, "learning_rate": 0.0002, "epoch": 7.78149386845039, "step": 6980}, {"loss": 1.0142, "grad_norm": 1.3185025453567505, "learning_rate": 0.0002, "epoch": 7.792642140468227, "step": 6990}, {"loss": 1.0294, "grad_norm": 1.3556475639343262, "learning_rate": 0.0002, "epoch": 7.803790412486064, "step": 7000}, {"loss": 1.0184, "grad_norm": 1.3264387845993042, "learning_rate": 0.0002, "epoch": 7.814938684503902, "step": 7010}, {"loss": 0.9507, "grad_norm": 1.4610573053359985, "learning_rate": 0.0002, "epoch": 7.826086956521739, "step": 7020}, {"loss": 0.9847, "grad_norm": 1.39540433883667, "learning_rate": 0.0002, "epoch": 7.837235228539576, "step": 7030}, {"loss": 1.0302, "grad_norm": 1.5537383556365967, "learning_rate": 0.0002, "epoch": 7.848383500557413, "step": 7040}, {"loss": 0.9941, "grad_norm": 1.6064108610153198, "learning_rate": 0.0002, "epoch": 7.8595317725752505, "step": 7050}, {"loss": 1.0205, "grad_norm": 1.4497601985931396, "learning_rate": 0.0002, "epoch": 7.8706800445930885, "step": 7060}, {"loss": 1.0416, "grad_norm": 1.3896540403366089, "learning_rate": 0.0002, "epoch": 7.8818283166109255, "step": 7070}, {"loss": 0.9959, "grad_norm": 1.4320734739303589, "learning_rate": 0.0002, "epoch": 7.892976588628763, "step": 7080}, {"loss": 1.0181, "grad_norm": 1.3116543292999268, "learning_rate": 0.0002, "epoch": 7.9041248606466, "step": 7090}, {"loss": 1.0162, "grad_norm": 1.290254831314087, "learning_rate": 0.0002, "epoch": 7.915273132664437, "step": 7100}, {"loss": 1.0486, "grad_norm": 1.4764007329940796, "learning_rate": 0.0002, "epoch": 7.926421404682275, "step": 7110}, {"loss": 1.0126, "grad_norm": 1.4759361743927002, "learning_rate": 0.0002, "epoch": 7.937569676700112, "step": 7120}, {"loss": 1.0223, "grad_norm": 1.4465186595916748, "learning_rate": 0.0002, "epoch": 7.948717948717949, "step": 7130}, {"loss": 0.9883, "grad_norm": 1.333365797996521, "learning_rate": 0.0002, "epoch": 7.959866220735786, "step": 7140}, {"loss": 0.9918, "grad_norm": 1.5393798351287842, "learning_rate": 0.0002, "epoch": 7.971014492753623, "step": 7150}, {"loss": 1.0166, "grad_norm": 1.3893442153930664, "learning_rate": 0.0002, "epoch": 7.98216276477146, "step": 7160}, {"loss": 1.052, "grad_norm": 1.4354097843170166, "learning_rate": 0.0002, "epoch": 7.993311036789297, "step": 7170}]}