diff --git a/.gitattributes b/.gitattributes index 77b43a973beb1bd1e6db4db73fbda4efe07704f2..66f35dedbdb39c5e1fb696248effc199deade94b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1129,3 +1129,12 @@ gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1 gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-600-sd-10000/checkpoint-55/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-600-sd-10000/checkpoint-82/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-600-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b836aa0c22049e4ee82137ada87f84096f5266f9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30f96f9302bef1ec0aaf47e94804c44aad59092b23b0b27bf9d289790b4ea01 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..22b70dd537050669baa485d531c0d7d7a1e3f05c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29d3ffbf8025741dc2aea0ca4c0d51cfc999557055485fedd87f62405e6acbea +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0d2cc7d3fee3f0e9e012a9c9ecf5e4f090505ce --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a89246ebc83d5d2adc961696a5d1a031fc7eda0918c44d9cff7c0817c77ec437 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5eeff4cb0c1fa1cc570410266fe6005a9cc4281 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adfb95e078c78d4fdd56bd42abb8bbbe58b6453e215db21f0fcb47713c177a02 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..78711295285824e3854fb481116294f6387364be --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8abe9d161ebbe0d87d7f353db092e8dc46bfd30e1254025025a542fbefbad8 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..59a8f5d6ea7ac671c9d3784f6278b11e58803908 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/trainer_state.json @@ -0,0 +1,874 @@ +{ + "best_metric": 1.8088148832321167, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 1194, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.525564652571853e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b836aa0c22049e4ee82137ada87f84096f5266f9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30f96f9302bef1ec0aaf47e94804c44aad59092b23b0b27bf9d289790b4ea01 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7611ee290717765cb5f3ba797d67d71a5bd828e6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a2f2722cf78fd35891e94a61dc0066a4905b4f4e89f30f795bb9c292b9cac0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..aea10fe5b63d67f140e481ad2ae84c92ca4bd7cf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b54fd81b3fab4b7b2ba804669407a398146ac0f686976ca2bdc7f43f958a3702 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..37d921733edcab27d0f19641bfc26267135acba0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aa5be69f5757671d6efa27aa0f43906b1c1bff09a566c3312fcf43509f2a8aa +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3d746a8fadc15f9e563077b7d7312344b39b305 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/trainer_state.json @@ -0,0 +1,1715 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2388, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1051129305143706e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bca2d5737fd942994dc524ea9897388c518338d0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:259b21759a2130b543d8e5541d7a6e0d0853d6237f4796802a02696a4b106a2d +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e37ceecad2bcc8628877965327089afff47f3eb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de8759b786c1db146e692237ab5caa24fb747810e9949b2794145b8af72beeb6 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2aefe96f64ef3db668e45b9e7fd0ba3ca2b9e267 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e34926da297de597fc63351ccc35e8ad04c089f6e7bc1ff21e56bff148cf0e +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdecae0f5c6914af7a4a5460fe3255f37ec9a6fe --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c485ec9117093fe9e08897a1e4e1356498391f3c460e6b93e1b0cf1c2ea33b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f359246406fad41735773092f8aff347c681f58e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/trainer_state.json @@ -0,0 +1,2563 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 3582, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.657669395771556e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-3582/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5cba73e531f58f4a3cc428e3f5d84465c68515bf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062585134be8cf81fd37b315d77e165d9899d6aa71f48cc5578832f447e09ef3 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..24a5d4d365808c8f2ea652bf105ae56a273f05d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c616b385484b1f897ce52e9d33c1e9943b0c29973d1b16aeaf3e7075a9f9742e +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..36c97f33ebe4e728ac9b8e4038333fd3b90336f0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46cae93becb90ef77d43ea5352ea11ec36ab0cc3ba9368f98b71e01054204797 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38e539005d2ed57a9081fc406809ee3db3927318 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abddd4cb5f006a0dc83dc4d7301cd847cdc04f5d4b4af440089fdfbd8dec415b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c6e737a07e786b7b264c81b389a4f9aa63e02ac --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/trainer_state.json @@ -0,0 +1,3404 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 4776, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + }, + { + "epoch": 3.006700167504188, + "grad_norm": 0.6784713268280029, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3590 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5783940553665161, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3600 + }, + { + "epoch": 3.023450586264657, + "grad_norm": 0.5408937335014343, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 3610 + }, + { + "epoch": 3.0318257956448913, + "grad_norm": 0.5229013562202454, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3620 + }, + { + "epoch": 3.040201005025126, + "grad_norm": 0.49160143733024597, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 3630 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.6563201546669006, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 3640 + }, + { + "epoch": 3.056951423785595, + "grad_norm": 0.5686020851135254, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3650 + }, + { + "epoch": 3.0653266331658293, + "grad_norm": 0.5774043202400208, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3660 + }, + { + "epoch": 3.073701842546064, + "grad_norm": 0.6106171011924744, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 3670 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.517433226108551, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3680 + }, + { + "epoch": 3.090452261306533, + "grad_norm": 0.5681702494621277, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3690 + }, + { + "epoch": 3.0988274706867673, + "grad_norm": 0.5769233107566833, + "learning_rate": 0.0002, + "loss": 1.4731, + "step": 3700 + }, + { + "epoch": 3.107202680067002, + "grad_norm": 0.5657462477684021, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 3710 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.6035246253013611, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3720 + }, + { + "epoch": 3.123953098827471, + "grad_norm": 0.7286643385887146, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 3730 + }, + { + "epoch": 3.1323283082077054, + "grad_norm": 0.5121201872825623, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 3740 + }, + { + "epoch": 3.14070351758794, + "grad_norm": 0.5074213147163391, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3750 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.57481849193573, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 3760 + }, + { + "epoch": 3.157453936348409, + "grad_norm": 0.6326663494110107, + "learning_rate": 0.0002, + "loss": 1.4765, + "step": 3770 + }, + { + "epoch": 3.1658291457286434, + "grad_norm": 0.6039315462112427, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3780 + }, + { + "epoch": 3.174204355108878, + "grad_norm": 0.6936715245246887, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 3790 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.6516796946525574, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3800 + }, + { + "epoch": 3.190954773869347, + "grad_norm": 0.6140730977058411, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3810 + }, + { + "epoch": 3.1993299832495814, + "grad_norm": 0.631328284740448, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3820 + }, + { + "epoch": 3.207705192629816, + "grad_norm": 0.6265402436256409, + "learning_rate": 0.0002, + "loss": 1.4844, + "step": 3830 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6649428606033325, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3840 + }, + { + "epoch": 3.224455611390285, + "grad_norm": 0.5329259634017944, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 3850 + }, + { + "epoch": 3.2328308207705194, + "grad_norm": 0.6008304953575134, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3860 + }, + { + "epoch": 3.241206030150754, + "grad_norm": 0.5918582081794739, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 3870 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.643622100353241, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3880 + }, + { + "epoch": 3.257956448911223, + "grad_norm": 0.5517964363098145, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 3890 + }, + { + "epoch": 3.2663316582914574, + "grad_norm": 0.6780755519866943, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 3900 + }, + { + "epoch": 3.274706867671692, + "grad_norm": 0.6742202639579773, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3910 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.6228749752044678, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 3920 + }, + { + "epoch": 3.291457286432161, + "grad_norm": 0.5836303234100342, + "learning_rate": 0.0002, + "loss": 1.4899, + "step": 3930 + }, + { + "epoch": 3.2998324958123955, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3940 + }, + { + "epoch": 3.30820770519263, + "grad_norm": 0.6345084309577942, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 3950 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.6125303506851196, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3960 + }, + { + "epoch": 3.324958123953099, + "grad_norm": 0.6259911060333252, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 3970 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.645745575428009, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3980 + }, + { + "epoch": 3.341708542713568, + "grad_norm": 0.6666176915168762, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.59013831615448, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 4000 + }, + { + "epoch": 3.358458961474037, + "grad_norm": 0.6604634523391724, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 4010 + }, + { + "epoch": 3.3668341708542715, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 4020 + }, + { + "epoch": 3.375209380234506, + "grad_norm": 0.515724778175354, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 4030 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.681968092918396, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4040 + }, + { + "epoch": 3.391959798994975, + "grad_norm": 0.5978158116340637, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4050 + }, + { + "epoch": 3.4003350083752095, + "grad_norm": 0.6043432354927063, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 4060 + }, + { + "epoch": 3.408710217755444, + "grad_norm": 0.5899770855903625, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4070 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6014242172241211, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4080 + }, + { + "epoch": 3.425460636515913, + "grad_norm": 0.5944811105728149, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 4090 + }, + { + "epoch": 3.4338358458961475, + "grad_norm": 0.6506822109222412, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4100 + }, + { + "epoch": 3.442211055276382, + "grad_norm": 0.6926528811454773, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 4110 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.5646378993988037, + "learning_rate": 0.0002, + "loss": 1.5169, + "step": 4120 + }, + { + "epoch": 3.458961474036851, + "grad_norm": 0.7233654856681824, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 4130 + }, + { + "epoch": 3.4673366834170856, + "grad_norm": 0.6231815814971924, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4140 + }, + { + "epoch": 3.47571189279732, + "grad_norm": 0.6115689873695374, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 4150 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5812674760818481, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 4160 + }, + { + "epoch": 3.492462311557789, + "grad_norm": 0.6099632978439331, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 4170 + }, + { + "epoch": 3.5008375209380236, + "grad_norm": 0.6102647185325623, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 4180 + }, + { + "epoch": 3.509212730318258, + "grad_norm": 0.6034680008888245, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4190 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.6281666159629822, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 4200 + }, + { + "epoch": 3.525963149078727, + "grad_norm": 0.6245372295379639, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 4210 + }, + { + "epoch": 3.5343383584589616, + "grad_norm": 0.5897293090820312, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 4220 + }, + { + "epoch": 3.542713567839196, + "grad_norm": 0.601054847240448, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4230 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.7004473805427551, + "learning_rate": 0.0002, + "loss": 1.4974, + "step": 4240 + }, + { + "epoch": 3.559463986599665, + "grad_norm": 0.6601553559303284, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4250 + }, + { + "epoch": 3.5678391959798996, + "grad_norm": 0.6112467050552368, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4260 + }, + { + "epoch": 3.576214405360134, + "grad_norm": 0.5902454853057861, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4270 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5792450904846191, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4280 + }, + { + "epoch": 3.592964824120603, + "grad_norm": 0.5923888087272644, + "learning_rate": 0.0002, + "loss": 1.4664, + "step": 4290 + }, + { + "epoch": 3.6013400335008376, + "grad_norm": 0.5869482159614563, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 4300 + }, + { + "epoch": 3.609715242881072, + "grad_norm": 0.6372929811477661, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 4310 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.6350686550140381, + "learning_rate": 0.0002, + "loss": 1.4977, + "step": 4320 + }, + { + "epoch": 3.626465661641541, + "grad_norm": 0.571819007396698, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4330 + }, + { + "epoch": 3.6348408710217757, + "grad_norm": 0.592250645160675, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4340 + }, + { + "epoch": 3.64321608040201, + "grad_norm": 0.6110650897026062, + "learning_rate": 0.0002, + "loss": 1.4912, + "step": 4350 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.6187081336975098, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 4360 + }, + { + "epoch": 3.659966499162479, + "grad_norm": 0.6197671890258789, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 4370 + }, + { + "epoch": 3.6683417085427137, + "grad_norm": 0.6050862669944763, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 3.676716917922948, + "grad_norm": 0.621265172958374, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 4390 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.6552940011024475, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4400 + }, + { + "epoch": 3.693467336683417, + "grad_norm": 0.5638861060142517, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 4410 + }, + { + "epoch": 3.7018425460636517, + "grad_norm": 0.6388863325119019, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4420 + }, + { + "epoch": 3.710217755443886, + "grad_norm": 0.6062559485435486, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 4430 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5800350308418274, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 4440 + }, + { + "epoch": 3.726968174204355, + "grad_norm": 0.5954474210739136, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 4450 + }, + { + "epoch": 3.7353433835845897, + "grad_norm": 0.5880125761032104, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 4460 + }, + { + "epoch": 3.7437185929648242, + "grad_norm": 0.5880921483039856, + "learning_rate": 0.0002, + "loss": 1.5055, + "step": 4470 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.5995073914527893, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4480 + }, + { + "epoch": 3.7604690117252932, + "grad_norm": 0.5958493947982788, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4490 + }, + { + "epoch": 3.7688442211055277, + "grad_norm": 0.5694711804389954, + "learning_rate": 0.0002, + "loss": 1.5472, + "step": 4500 + }, + { + "epoch": 3.7772194304857623, + "grad_norm": 0.6175141930580139, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 4510 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 4520 + }, + { + "epoch": 3.7939698492462313, + "grad_norm": 0.5986164808273315, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 4530 + }, + { + "epoch": 3.8023450586264658, + "grad_norm": 0.640072226524353, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4540 + }, + { + "epoch": 3.8107202680067003, + "grad_norm": 0.5742579698562622, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4550 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.6658656001091003, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4560 + }, + { + "epoch": 3.8274706867671693, + "grad_norm": 0.5475369691848755, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 4570 + }, + { + "epoch": 3.835845896147404, + "grad_norm": 0.613172173500061, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4580 + }, + { + "epoch": 3.8442211055276383, + "grad_norm": 0.590968132019043, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4590 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4600 + }, + { + "epoch": 3.8609715242881073, + "grad_norm": 0.6815178990364075, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4610 + }, + { + "epoch": 3.869346733668342, + "grad_norm": 0.6551400423049927, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4620 + }, + { + "epoch": 3.8777219430485763, + "grad_norm": 0.6398897171020508, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 4630 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6761762499809265, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 4640 + }, + { + "epoch": 3.8944723618090453, + "grad_norm": 0.6277294754981995, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4650 + }, + { + "epoch": 3.90284757118928, + "grad_norm": 0.6285301446914673, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 4660 + }, + { + "epoch": 3.9112227805695143, + "grad_norm": 0.5416069626808167, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4670 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.6314545273780823, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4680 + }, + { + "epoch": 3.9279731993299833, + "grad_norm": 0.604479968547821, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 4690 + }, + { + "epoch": 3.936348408710218, + "grad_norm": 0.5321660041809082, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 4700 + }, + { + "epoch": 3.9447236180904524, + "grad_norm": 0.6632516980171204, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 4710 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.5925896763801575, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 4720 + }, + { + "epoch": 3.9614740368509214, + "grad_norm": 0.6580308675765991, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4730 + }, + { + "epoch": 3.969849246231156, + "grad_norm": 0.5578170418739319, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 4740 + }, + { + "epoch": 3.9782244556113904, + "grad_norm": 0.6216608285903931, + "learning_rate": 0.0002, + "loss": 1.5394, + "step": 4750 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.5693069696426392, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 4760 + }, + { + "epoch": 3.9949748743718594, + "grad_norm": 0.5353434681892395, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4770 + }, + { + "epoch": 4.0, + "eval_loss": 1.8809821605682373, + "eval_runtime": 37.9695, + "eval_samples_per_second": 13.564, + "eval_steps_per_second": 1.712, + "step": 4776 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.210225861028741e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-4776/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10f2d529a37ef91f17df9a5604cc74f3f075309a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a919013d99ab45807e37c596ee39caa0a1296782aa9a6ff9e67b3a8179a3b79a +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f53e7f805161c8d88a07932fc7017bdf78948a1d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45913cc0661c6005cb73389ccc8d47b9b7b7082fed343b532cc26774f1a707dd +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f762b625dbb1ed4332e1bd0a56b8d2271eb13d5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3e294228736038ce5b1cdf36fda290914e49ccc2fb5997a703736bf3485da27 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dfc75f10e9f51117e5f561a5d0b15bda67213a3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:001d71f239fa6dc6a8238c31d4375718af287b278554a1c1582559d2cad2bb41 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42fe1d16642d50586c186401b081776489644878 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/trainer_state.json @@ -0,0 +1,4252 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 5970, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + }, + { + "epoch": 3.006700167504188, + "grad_norm": 0.6784713268280029, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3590 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5783940553665161, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3600 + }, + { + "epoch": 3.023450586264657, + "grad_norm": 0.5408937335014343, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 3610 + }, + { + "epoch": 3.0318257956448913, + "grad_norm": 0.5229013562202454, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3620 + }, + { + "epoch": 3.040201005025126, + "grad_norm": 0.49160143733024597, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 3630 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.6563201546669006, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 3640 + }, + { + "epoch": 3.056951423785595, + "grad_norm": 0.5686020851135254, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3650 + }, + { + "epoch": 3.0653266331658293, + "grad_norm": 0.5774043202400208, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3660 + }, + { + "epoch": 3.073701842546064, + "grad_norm": 0.6106171011924744, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 3670 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.517433226108551, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3680 + }, + { + "epoch": 3.090452261306533, + "grad_norm": 0.5681702494621277, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3690 + }, + { + "epoch": 3.0988274706867673, + "grad_norm": 0.5769233107566833, + "learning_rate": 0.0002, + "loss": 1.4731, + "step": 3700 + }, + { + "epoch": 3.107202680067002, + "grad_norm": 0.5657462477684021, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 3710 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.6035246253013611, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3720 + }, + { + "epoch": 3.123953098827471, + "grad_norm": 0.7286643385887146, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 3730 + }, + { + "epoch": 3.1323283082077054, + "grad_norm": 0.5121201872825623, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 3740 + }, + { + "epoch": 3.14070351758794, + "grad_norm": 0.5074213147163391, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3750 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.57481849193573, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 3760 + }, + { + "epoch": 3.157453936348409, + "grad_norm": 0.6326663494110107, + "learning_rate": 0.0002, + "loss": 1.4765, + "step": 3770 + }, + { + "epoch": 3.1658291457286434, + "grad_norm": 0.6039315462112427, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3780 + }, + { + "epoch": 3.174204355108878, + "grad_norm": 0.6936715245246887, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 3790 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.6516796946525574, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3800 + }, + { + "epoch": 3.190954773869347, + "grad_norm": 0.6140730977058411, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3810 + }, + { + "epoch": 3.1993299832495814, + "grad_norm": 0.631328284740448, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3820 + }, + { + "epoch": 3.207705192629816, + "grad_norm": 0.6265402436256409, + "learning_rate": 0.0002, + "loss": 1.4844, + "step": 3830 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6649428606033325, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3840 + }, + { + "epoch": 3.224455611390285, + "grad_norm": 0.5329259634017944, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 3850 + }, + { + "epoch": 3.2328308207705194, + "grad_norm": 0.6008304953575134, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3860 + }, + { + "epoch": 3.241206030150754, + "grad_norm": 0.5918582081794739, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 3870 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.643622100353241, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3880 + }, + { + "epoch": 3.257956448911223, + "grad_norm": 0.5517964363098145, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 3890 + }, + { + "epoch": 3.2663316582914574, + "grad_norm": 0.6780755519866943, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 3900 + }, + { + "epoch": 3.274706867671692, + "grad_norm": 0.6742202639579773, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3910 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.6228749752044678, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 3920 + }, + { + "epoch": 3.291457286432161, + "grad_norm": 0.5836303234100342, + "learning_rate": 0.0002, + "loss": 1.4899, + "step": 3930 + }, + { + "epoch": 3.2998324958123955, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3940 + }, + { + "epoch": 3.30820770519263, + "grad_norm": 0.6345084309577942, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 3950 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.6125303506851196, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3960 + }, + { + "epoch": 3.324958123953099, + "grad_norm": 0.6259911060333252, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 3970 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.645745575428009, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3980 + }, + { + "epoch": 3.341708542713568, + "grad_norm": 0.6666176915168762, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.59013831615448, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 4000 + }, + { + "epoch": 3.358458961474037, + "grad_norm": 0.6604634523391724, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 4010 + }, + { + "epoch": 3.3668341708542715, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 4020 + }, + { + "epoch": 3.375209380234506, + "grad_norm": 0.515724778175354, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 4030 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.681968092918396, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4040 + }, + { + "epoch": 3.391959798994975, + "grad_norm": 0.5978158116340637, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4050 + }, + { + "epoch": 3.4003350083752095, + "grad_norm": 0.6043432354927063, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 4060 + }, + { + "epoch": 3.408710217755444, + "grad_norm": 0.5899770855903625, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4070 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6014242172241211, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4080 + }, + { + "epoch": 3.425460636515913, + "grad_norm": 0.5944811105728149, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 4090 + }, + { + "epoch": 3.4338358458961475, + "grad_norm": 0.6506822109222412, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4100 + }, + { + "epoch": 3.442211055276382, + "grad_norm": 0.6926528811454773, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 4110 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.5646378993988037, + "learning_rate": 0.0002, + "loss": 1.5169, + "step": 4120 + }, + { + "epoch": 3.458961474036851, + "grad_norm": 0.7233654856681824, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 4130 + }, + { + "epoch": 3.4673366834170856, + "grad_norm": 0.6231815814971924, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4140 + }, + { + "epoch": 3.47571189279732, + "grad_norm": 0.6115689873695374, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 4150 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5812674760818481, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 4160 + }, + { + "epoch": 3.492462311557789, + "grad_norm": 0.6099632978439331, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 4170 + }, + { + "epoch": 3.5008375209380236, + "grad_norm": 0.6102647185325623, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 4180 + }, + { + "epoch": 3.509212730318258, + "grad_norm": 0.6034680008888245, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4190 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.6281666159629822, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 4200 + }, + { + "epoch": 3.525963149078727, + "grad_norm": 0.6245372295379639, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 4210 + }, + { + "epoch": 3.5343383584589616, + "grad_norm": 0.5897293090820312, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 4220 + }, + { + "epoch": 3.542713567839196, + "grad_norm": 0.601054847240448, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4230 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.7004473805427551, + "learning_rate": 0.0002, + "loss": 1.4974, + "step": 4240 + }, + { + "epoch": 3.559463986599665, + "grad_norm": 0.6601553559303284, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4250 + }, + { + "epoch": 3.5678391959798996, + "grad_norm": 0.6112467050552368, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4260 + }, + { + "epoch": 3.576214405360134, + "grad_norm": 0.5902454853057861, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4270 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5792450904846191, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4280 + }, + { + "epoch": 3.592964824120603, + "grad_norm": 0.5923888087272644, + "learning_rate": 0.0002, + "loss": 1.4664, + "step": 4290 + }, + { + "epoch": 3.6013400335008376, + "grad_norm": 0.5869482159614563, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 4300 + }, + { + "epoch": 3.609715242881072, + "grad_norm": 0.6372929811477661, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 4310 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.6350686550140381, + "learning_rate": 0.0002, + "loss": 1.4977, + "step": 4320 + }, + { + "epoch": 3.626465661641541, + "grad_norm": 0.571819007396698, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4330 + }, + { + "epoch": 3.6348408710217757, + "grad_norm": 0.592250645160675, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4340 + }, + { + "epoch": 3.64321608040201, + "grad_norm": 0.6110650897026062, + "learning_rate": 0.0002, + "loss": 1.4912, + "step": 4350 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.6187081336975098, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 4360 + }, + { + "epoch": 3.659966499162479, + "grad_norm": 0.6197671890258789, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 4370 + }, + { + "epoch": 3.6683417085427137, + "grad_norm": 0.6050862669944763, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 3.676716917922948, + "grad_norm": 0.621265172958374, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 4390 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.6552940011024475, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4400 + }, + { + "epoch": 3.693467336683417, + "grad_norm": 0.5638861060142517, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 4410 + }, + { + "epoch": 3.7018425460636517, + "grad_norm": 0.6388863325119019, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4420 + }, + { + "epoch": 3.710217755443886, + "grad_norm": 0.6062559485435486, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 4430 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5800350308418274, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 4440 + }, + { + "epoch": 3.726968174204355, + "grad_norm": 0.5954474210739136, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 4450 + }, + { + "epoch": 3.7353433835845897, + "grad_norm": 0.5880125761032104, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 4460 + }, + { + "epoch": 3.7437185929648242, + "grad_norm": 0.5880921483039856, + "learning_rate": 0.0002, + "loss": 1.5055, + "step": 4470 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.5995073914527893, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4480 + }, + { + "epoch": 3.7604690117252932, + "grad_norm": 0.5958493947982788, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4490 + }, + { + "epoch": 3.7688442211055277, + "grad_norm": 0.5694711804389954, + "learning_rate": 0.0002, + "loss": 1.5472, + "step": 4500 + }, + { + "epoch": 3.7772194304857623, + "grad_norm": 0.6175141930580139, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 4510 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 4520 + }, + { + "epoch": 3.7939698492462313, + "grad_norm": 0.5986164808273315, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 4530 + }, + { + "epoch": 3.8023450586264658, + "grad_norm": 0.640072226524353, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4540 + }, + { + "epoch": 3.8107202680067003, + "grad_norm": 0.5742579698562622, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4550 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.6658656001091003, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4560 + }, + { + "epoch": 3.8274706867671693, + "grad_norm": 0.5475369691848755, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 4570 + }, + { + "epoch": 3.835845896147404, + "grad_norm": 0.613172173500061, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4580 + }, + { + "epoch": 3.8442211055276383, + "grad_norm": 0.590968132019043, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4590 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4600 + }, + { + "epoch": 3.8609715242881073, + "grad_norm": 0.6815178990364075, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4610 + }, + { + "epoch": 3.869346733668342, + "grad_norm": 0.6551400423049927, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4620 + }, + { + "epoch": 3.8777219430485763, + "grad_norm": 0.6398897171020508, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 4630 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6761762499809265, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 4640 + }, + { + "epoch": 3.8944723618090453, + "grad_norm": 0.6277294754981995, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4650 + }, + { + "epoch": 3.90284757118928, + "grad_norm": 0.6285301446914673, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 4660 + }, + { + "epoch": 3.9112227805695143, + "grad_norm": 0.5416069626808167, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4670 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.6314545273780823, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4680 + }, + { + "epoch": 3.9279731993299833, + "grad_norm": 0.604479968547821, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 4690 + }, + { + "epoch": 3.936348408710218, + "grad_norm": 0.5321660041809082, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 4700 + }, + { + "epoch": 3.9447236180904524, + "grad_norm": 0.6632516980171204, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 4710 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.5925896763801575, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 4720 + }, + { + "epoch": 3.9614740368509214, + "grad_norm": 0.6580308675765991, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4730 + }, + { + "epoch": 3.969849246231156, + "grad_norm": 0.5578170418739319, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 4740 + }, + { + "epoch": 3.9782244556113904, + "grad_norm": 0.6216608285903931, + "learning_rate": 0.0002, + "loss": 1.5394, + "step": 4750 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.5693069696426392, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 4760 + }, + { + "epoch": 3.9949748743718594, + "grad_norm": 0.5353434681892395, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4770 + }, + { + "epoch": 4.0, + "eval_loss": 1.8809821605682373, + "eval_runtime": 37.9695, + "eval_samples_per_second": 13.564, + "eval_steps_per_second": 1.712, + "step": 4776 + }, + { + "epoch": 4.0033500837520934, + "grad_norm": 0.6117817759513855, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4780 + }, + { + "epoch": 4.011725293132328, + "grad_norm": 0.6816073656082153, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 4790 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.715548038482666, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 4800 + }, + { + "epoch": 4.028475711892797, + "grad_norm": 0.8585814833641052, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 4810 + }, + { + "epoch": 4.0368509212730315, + "grad_norm": 0.7372158765792847, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 4820 + }, + { + "epoch": 4.045226130653266, + "grad_norm": 0.8915117979049683, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4830 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.9323588013648987, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 4840 + }, + { + "epoch": 4.061976549413735, + "grad_norm": 0.9298437237739563, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4850 + }, + { + "epoch": 4.0703517587939695, + "grad_norm": 0.8541792035102844, + "learning_rate": 0.0002, + "loss": 1.4173, + "step": 4860 + }, + { + "epoch": 4.078726968174204, + "grad_norm": 0.7833571434020996, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 4870 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.9325295090675354, + "learning_rate": 0.0002, + "loss": 1.3835, + "step": 4880 + }, + { + "epoch": 4.0954773869346734, + "grad_norm": 0.7066370248794556, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 4890 + }, + { + "epoch": 4.1038525963149075, + "grad_norm": 0.712640643119812, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 4900 + }, + { + "epoch": 4.1122278056951425, + "grad_norm": 0.6970218420028687, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4910 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.7979312539100647, + "learning_rate": 0.0002, + "loss": 1.3805, + "step": 4920 + }, + { + "epoch": 4.1289782244556115, + "grad_norm": 0.7801558375358582, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4930 + }, + { + "epoch": 4.1373534338358455, + "grad_norm": 0.7505159974098206, + "learning_rate": 0.0002, + "loss": 1.3288, + "step": 4940 + }, + { + "epoch": 4.1457286432160805, + "grad_norm": 0.738201916217804, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 4950 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7736659049987793, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4960 + }, + { + "epoch": 4.1624790619765495, + "grad_norm": 0.7850064635276794, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 4970 + }, + { + "epoch": 4.1708542713567835, + "grad_norm": 0.8316620588302612, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 4980 + }, + { + "epoch": 4.1792294807370185, + "grad_norm": 0.7217330932617188, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 4990 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.7050199508666992, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 5000 + }, + { + "epoch": 4.1959798994974875, + "grad_norm": 0.6992659568786621, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 5010 + }, + { + "epoch": 4.204355108877722, + "grad_norm": 0.7648445963859558, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 5020 + }, + { + "epoch": 4.2127303182579565, + "grad_norm": 0.8093137741088867, + "learning_rate": 0.0002, + "loss": 1.3339, + "step": 5030 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.6907750368118286, + "learning_rate": 0.0002, + "loss": 1.37, + "step": 5040 + }, + { + "epoch": 4.2294807370184255, + "grad_norm": 0.7000078558921814, + "learning_rate": 0.0002, + "loss": 1.4231, + "step": 5050 + }, + { + "epoch": 4.23785594639866, + "grad_norm": 0.715034008026123, + "learning_rate": 0.0002, + "loss": 1.3411, + "step": 5060 + }, + { + "epoch": 4.2462311557788945, + "grad_norm": 0.828895628452301, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5070 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7127292156219482, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 5080 + }, + { + "epoch": 4.2629815745393635, + "grad_norm": 0.8256623148918152, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 5090 + }, + { + "epoch": 4.271356783919598, + "grad_norm": 0.8062452077865601, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5100 + }, + { + "epoch": 4.279731993299833, + "grad_norm": 0.6861081123352051, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5110 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 0.7566041350364685, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 5120 + }, + { + "epoch": 4.296482412060302, + "grad_norm": 0.8734753727912903, + "learning_rate": 0.0002, + "loss": 1.4571, + "step": 5130 + }, + { + "epoch": 4.304857621440536, + "grad_norm": 0.8559320569038391, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5140 + }, + { + "epoch": 4.313232830820771, + "grad_norm": 0.6965576410293579, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 5150 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.8277813792228699, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 5160 + }, + { + "epoch": 4.32998324958124, + "grad_norm": 1.0733633041381836, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5170 + }, + { + "epoch": 4.338358458961474, + "grad_norm": 0.7914809584617615, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5180 + }, + { + "epoch": 4.346733668341709, + "grad_norm": 0.8307849168777466, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 5190 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.7066516280174255, + "learning_rate": 0.0002, + "loss": 1.4318, + "step": 5200 + }, + { + "epoch": 4.363484087102178, + "grad_norm": 0.9676792025566101, + "learning_rate": 0.0002, + "loss": 1.3866, + "step": 5210 + }, + { + "epoch": 4.371859296482412, + "grad_norm": 0.7672301530838013, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5220 + }, + { + "epoch": 4.380234505862647, + "grad_norm": 0.6888260245323181, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5230 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.8775295615196228, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 5240 + }, + { + "epoch": 4.396984924623116, + "grad_norm": 0.8742642998695374, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 5250 + }, + { + "epoch": 4.40536013400335, + "grad_norm": 0.6935433745384216, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 5260 + }, + { + "epoch": 4.413735343383585, + "grad_norm": 0.7726178169250488, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5270 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.7493860721588135, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5280 + }, + { + "epoch": 4.430485762144054, + "grad_norm": 0.7758517265319824, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 5290 + }, + { + "epoch": 4.438860971524288, + "grad_norm": 0.779315173625946, + "learning_rate": 0.0002, + "loss": 1.2916, + "step": 5300 + }, + { + "epoch": 4.447236180904523, + "grad_norm": 0.7753667235374451, + "learning_rate": 0.0002, + "loss": 1.4483, + "step": 5310 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.8738188743591309, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 5320 + }, + { + "epoch": 4.463986599664992, + "grad_norm": 0.8410757184028625, + "learning_rate": 0.0002, + "loss": 1.41, + "step": 5330 + }, + { + "epoch": 4.472361809045226, + "grad_norm": 0.728897750377655, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 5340 + }, + { + "epoch": 4.480737018425461, + "grad_norm": 0.7880531549453735, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 5350 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.8455142378807068, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 5360 + }, + { + "epoch": 4.49748743718593, + "grad_norm": 0.8527868986129761, + "learning_rate": 0.0002, + "loss": 1.431, + "step": 5370 + }, + { + "epoch": 4.505862646566165, + "grad_norm": 0.7743009328842163, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5380 + }, + { + "epoch": 4.514237855946399, + "grad_norm": 0.7555320858955383, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 5390 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.8146619200706482, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5400 + }, + { + "epoch": 4.530988274706868, + "grad_norm": 0.8042502999305725, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5410 + }, + { + "epoch": 4.539363484087103, + "grad_norm": 0.7329140305519104, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5420 + }, + { + "epoch": 4.547738693467337, + "grad_norm": 0.7574753165245056, + "learning_rate": 0.0002, + "loss": 1.3946, + "step": 5430 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 1.1223409175872803, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 5440 + }, + { + "epoch": 4.564489112227806, + "grad_norm": 0.7647369503974915, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5450 + }, + { + "epoch": 4.572864321608041, + "grad_norm": 0.9135531187057495, + "learning_rate": 0.0002, + "loss": 1.4569, + "step": 5460 + }, + { + "epoch": 4.581239530988275, + "grad_norm": 0.9343693852424622, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5470 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.869945764541626, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 5480 + }, + { + "epoch": 4.597989949748744, + "grad_norm": 0.7383785843849182, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5490 + }, + { + "epoch": 4.606365159128979, + "grad_norm": 0.7988699674606323, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5500 + }, + { + "epoch": 4.614740368509213, + "grad_norm": 0.8731256127357483, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 5510 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.7577664256095886, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5520 + }, + { + "epoch": 4.631490787269682, + "grad_norm": 0.7825039625167847, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5530 + }, + { + "epoch": 4.639865996649917, + "grad_norm": 0.8534902930259705, + "learning_rate": 0.0002, + "loss": 1.4594, + "step": 5540 + }, + { + "epoch": 4.648241206030151, + "grad_norm": 0.7403318285942078, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 5550 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.8229990005493164, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5560 + }, + { + "epoch": 4.66499162479062, + "grad_norm": 0.8279513716697693, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5570 + }, + { + "epoch": 4.673366834170855, + "grad_norm": 0.8923851251602173, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5580 + }, + { + "epoch": 4.681742043551089, + "grad_norm": 0.7457540035247803, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5590 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7110715508460999, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 5600 + }, + { + "epoch": 4.698492462311558, + "grad_norm": 0.7135499119758606, + "learning_rate": 0.0002, + "loss": 1.4327, + "step": 5610 + }, + { + "epoch": 4.706867671691793, + "grad_norm": 0.7606837153434753, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 5620 + }, + { + "epoch": 4.715242881072027, + "grad_norm": 0.9622916579246521, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5630 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.7665684819221497, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 5640 + }, + { + "epoch": 4.731993299832496, + "grad_norm": 0.7985475659370422, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 5650 + }, + { + "epoch": 4.740368509212731, + "grad_norm": 0.9179279208183289, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 5660 + }, + { + "epoch": 4.748743718592965, + "grad_norm": 0.8311634063720703, + "learning_rate": 0.0002, + "loss": 1.4379, + "step": 5670 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.7773269414901733, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 5680 + }, + { + "epoch": 4.765494137353434, + "grad_norm": 0.7771748900413513, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5690 + }, + { + "epoch": 4.773869346733669, + "grad_norm": 0.7518507242202759, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 5700 + }, + { + "epoch": 4.782244556113903, + "grad_norm": 0.7699326276779175, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 5710 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7001115679740906, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5720 + }, + { + "epoch": 4.798994974874372, + "grad_norm": 0.7220682501792908, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 5730 + }, + { + "epoch": 4.807370184254607, + "grad_norm": 0.7654005289077759, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5740 + }, + { + "epoch": 4.815745393634841, + "grad_norm": 0.8132795095443726, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5750 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.7105404138565063, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5760 + }, + { + "epoch": 4.83249581239531, + "grad_norm": 0.9346209764480591, + "learning_rate": 0.0002, + "loss": 1.4289, + "step": 5770 + }, + { + "epoch": 4.840871021775545, + "grad_norm": 1.0075623989105225, + "learning_rate": 0.0002, + "loss": 1.4066, + "step": 5780 + }, + { + "epoch": 4.849246231155779, + "grad_norm": 0.758376955986023, + "learning_rate": 0.0002, + "loss": 1.4558, + "step": 5790 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.854821503162384, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 5800 + }, + { + "epoch": 4.865996649916248, + "grad_norm": 0.8226943016052246, + "learning_rate": 0.0002, + "loss": 1.4014, + "step": 5810 + }, + { + "epoch": 4.874371859296483, + "grad_norm": 0.7510473728179932, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 5820 + }, + { + "epoch": 4.882747068676717, + "grad_norm": 0.7449678182601929, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 5830 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.7840824723243713, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 5840 + }, + { + "epoch": 4.899497487437186, + "grad_norm": 0.8811169862747192, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5850 + }, + { + "epoch": 4.907872696817421, + "grad_norm": 0.84914630651474, + "learning_rate": 0.0002, + "loss": 1.3827, + "step": 5860 + }, + { + "epoch": 4.916247906197655, + "grad_norm": 0.7514461874961853, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 5870 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.7229002118110657, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5880 + }, + { + "epoch": 4.932998324958124, + "grad_norm": 0.9418245553970337, + "learning_rate": 0.0002, + "loss": 1.4302, + "step": 5890 + }, + { + "epoch": 4.941373534338359, + "grad_norm": 0.7626827359199524, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5900 + }, + { + "epoch": 4.949748743718593, + "grad_norm": 0.7711105346679688, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 5910 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8689648509025574, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 5920 + }, + { + "epoch": 4.966499162479062, + "grad_norm": 0.7873271107673645, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 5930 + }, + { + "epoch": 4.974874371859297, + "grad_norm": 0.7637495994567871, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5940 + }, + { + "epoch": 4.983249581239531, + "grad_norm": 0.9907955527305603, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 5950 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7827328443527222, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 5960 + }, + { + "epoch": 5.0, + "grad_norm": 0.818544328212738, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5970 + }, + { + "epoch": 5.0, + "eval_loss": 1.9436752796173096, + "eval_runtime": 38.087, + "eval_samples_per_second": 13.522, + "eval_steps_per_second": 1.707, + "step": 5970 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7627823262859264e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-5970/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b9b5a7c3daecd4a9e34f9ad48cc515e2a1507123 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f545c65930e129451c7bd36fe44d012aae2447414bb532a37ca7abc620d7775 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..60ef97833c29f0a8b147569e38477df5103c908b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:253d86578f77f319e4495019367da1126161d355663002c435a03cfd60e5e490 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9554636e46e5522b1b34984e8dd336bd569b41d1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fa27651cbfd786c89957042d8a037ee5f55b3f5bcca720987db90428d3acab7 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14ed377265eeef7f79011504a89eab0c19e3d605 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d557a94ba0ad3d4e2208806abe0193a81d0a9c1a41908ba6fd66c3024f5b07c0 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fb540429d779bec479514060e1ce206e1fe79424 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/trainer_state.json @@ -0,0 +1,5093 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 7164, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + }, + { + "epoch": 3.006700167504188, + "grad_norm": 0.6784713268280029, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3590 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5783940553665161, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3600 + }, + { + "epoch": 3.023450586264657, + "grad_norm": 0.5408937335014343, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 3610 + }, + { + "epoch": 3.0318257956448913, + "grad_norm": 0.5229013562202454, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3620 + }, + { + "epoch": 3.040201005025126, + "grad_norm": 0.49160143733024597, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 3630 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.6563201546669006, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 3640 + }, + { + "epoch": 3.056951423785595, + "grad_norm": 0.5686020851135254, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3650 + }, + { + "epoch": 3.0653266331658293, + "grad_norm": 0.5774043202400208, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3660 + }, + { + "epoch": 3.073701842546064, + "grad_norm": 0.6106171011924744, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 3670 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.517433226108551, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3680 + }, + { + "epoch": 3.090452261306533, + "grad_norm": 0.5681702494621277, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3690 + }, + { + "epoch": 3.0988274706867673, + "grad_norm": 0.5769233107566833, + "learning_rate": 0.0002, + "loss": 1.4731, + "step": 3700 + }, + { + "epoch": 3.107202680067002, + "grad_norm": 0.5657462477684021, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 3710 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.6035246253013611, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3720 + }, + { + "epoch": 3.123953098827471, + "grad_norm": 0.7286643385887146, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 3730 + }, + { + "epoch": 3.1323283082077054, + "grad_norm": 0.5121201872825623, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 3740 + }, + { + "epoch": 3.14070351758794, + "grad_norm": 0.5074213147163391, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3750 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.57481849193573, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 3760 + }, + { + "epoch": 3.157453936348409, + "grad_norm": 0.6326663494110107, + "learning_rate": 0.0002, + "loss": 1.4765, + "step": 3770 + }, + { + "epoch": 3.1658291457286434, + "grad_norm": 0.6039315462112427, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3780 + }, + { + "epoch": 3.174204355108878, + "grad_norm": 0.6936715245246887, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 3790 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.6516796946525574, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3800 + }, + { + "epoch": 3.190954773869347, + "grad_norm": 0.6140730977058411, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3810 + }, + { + "epoch": 3.1993299832495814, + "grad_norm": 0.631328284740448, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3820 + }, + { + "epoch": 3.207705192629816, + "grad_norm": 0.6265402436256409, + "learning_rate": 0.0002, + "loss": 1.4844, + "step": 3830 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6649428606033325, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3840 + }, + { + "epoch": 3.224455611390285, + "grad_norm": 0.5329259634017944, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 3850 + }, + { + "epoch": 3.2328308207705194, + "grad_norm": 0.6008304953575134, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3860 + }, + { + "epoch": 3.241206030150754, + "grad_norm": 0.5918582081794739, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 3870 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.643622100353241, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3880 + }, + { + "epoch": 3.257956448911223, + "grad_norm": 0.5517964363098145, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 3890 + }, + { + "epoch": 3.2663316582914574, + "grad_norm": 0.6780755519866943, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 3900 + }, + { + "epoch": 3.274706867671692, + "grad_norm": 0.6742202639579773, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3910 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.6228749752044678, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 3920 + }, + { + "epoch": 3.291457286432161, + "grad_norm": 0.5836303234100342, + "learning_rate": 0.0002, + "loss": 1.4899, + "step": 3930 + }, + { + "epoch": 3.2998324958123955, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3940 + }, + { + "epoch": 3.30820770519263, + "grad_norm": 0.6345084309577942, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 3950 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.6125303506851196, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3960 + }, + { + "epoch": 3.324958123953099, + "grad_norm": 0.6259911060333252, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 3970 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.645745575428009, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3980 + }, + { + "epoch": 3.341708542713568, + "grad_norm": 0.6666176915168762, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.59013831615448, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 4000 + }, + { + "epoch": 3.358458961474037, + "grad_norm": 0.6604634523391724, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 4010 + }, + { + "epoch": 3.3668341708542715, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 4020 + }, + { + "epoch": 3.375209380234506, + "grad_norm": 0.515724778175354, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 4030 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.681968092918396, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4040 + }, + { + "epoch": 3.391959798994975, + "grad_norm": 0.5978158116340637, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4050 + }, + { + "epoch": 3.4003350083752095, + "grad_norm": 0.6043432354927063, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 4060 + }, + { + "epoch": 3.408710217755444, + "grad_norm": 0.5899770855903625, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4070 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6014242172241211, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4080 + }, + { + "epoch": 3.425460636515913, + "grad_norm": 0.5944811105728149, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 4090 + }, + { + "epoch": 3.4338358458961475, + "grad_norm": 0.6506822109222412, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4100 + }, + { + "epoch": 3.442211055276382, + "grad_norm": 0.6926528811454773, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 4110 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.5646378993988037, + "learning_rate": 0.0002, + "loss": 1.5169, + "step": 4120 + }, + { + "epoch": 3.458961474036851, + "grad_norm": 0.7233654856681824, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 4130 + }, + { + "epoch": 3.4673366834170856, + "grad_norm": 0.6231815814971924, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4140 + }, + { + "epoch": 3.47571189279732, + "grad_norm": 0.6115689873695374, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 4150 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5812674760818481, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 4160 + }, + { + "epoch": 3.492462311557789, + "grad_norm": 0.6099632978439331, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 4170 + }, + { + "epoch": 3.5008375209380236, + "grad_norm": 0.6102647185325623, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 4180 + }, + { + "epoch": 3.509212730318258, + "grad_norm": 0.6034680008888245, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4190 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.6281666159629822, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 4200 + }, + { + "epoch": 3.525963149078727, + "grad_norm": 0.6245372295379639, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 4210 + }, + { + "epoch": 3.5343383584589616, + "grad_norm": 0.5897293090820312, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 4220 + }, + { + "epoch": 3.542713567839196, + "grad_norm": 0.601054847240448, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4230 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.7004473805427551, + "learning_rate": 0.0002, + "loss": 1.4974, + "step": 4240 + }, + { + "epoch": 3.559463986599665, + "grad_norm": 0.6601553559303284, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4250 + }, + { + "epoch": 3.5678391959798996, + "grad_norm": 0.6112467050552368, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4260 + }, + { + "epoch": 3.576214405360134, + "grad_norm": 0.5902454853057861, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4270 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5792450904846191, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4280 + }, + { + "epoch": 3.592964824120603, + "grad_norm": 0.5923888087272644, + "learning_rate": 0.0002, + "loss": 1.4664, + "step": 4290 + }, + { + "epoch": 3.6013400335008376, + "grad_norm": 0.5869482159614563, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 4300 + }, + { + "epoch": 3.609715242881072, + "grad_norm": 0.6372929811477661, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 4310 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.6350686550140381, + "learning_rate": 0.0002, + "loss": 1.4977, + "step": 4320 + }, + { + "epoch": 3.626465661641541, + "grad_norm": 0.571819007396698, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4330 + }, + { + "epoch": 3.6348408710217757, + "grad_norm": 0.592250645160675, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4340 + }, + { + "epoch": 3.64321608040201, + "grad_norm": 0.6110650897026062, + "learning_rate": 0.0002, + "loss": 1.4912, + "step": 4350 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.6187081336975098, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 4360 + }, + { + "epoch": 3.659966499162479, + "grad_norm": 0.6197671890258789, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 4370 + }, + { + "epoch": 3.6683417085427137, + "grad_norm": 0.6050862669944763, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 3.676716917922948, + "grad_norm": 0.621265172958374, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 4390 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.6552940011024475, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4400 + }, + { + "epoch": 3.693467336683417, + "grad_norm": 0.5638861060142517, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 4410 + }, + { + "epoch": 3.7018425460636517, + "grad_norm": 0.6388863325119019, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4420 + }, + { + "epoch": 3.710217755443886, + "grad_norm": 0.6062559485435486, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 4430 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5800350308418274, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 4440 + }, + { + "epoch": 3.726968174204355, + "grad_norm": 0.5954474210739136, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 4450 + }, + { + "epoch": 3.7353433835845897, + "grad_norm": 0.5880125761032104, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 4460 + }, + { + "epoch": 3.7437185929648242, + "grad_norm": 0.5880921483039856, + "learning_rate": 0.0002, + "loss": 1.5055, + "step": 4470 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.5995073914527893, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4480 + }, + { + "epoch": 3.7604690117252932, + "grad_norm": 0.5958493947982788, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4490 + }, + { + "epoch": 3.7688442211055277, + "grad_norm": 0.5694711804389954, + "learning_rate": 0.0002, + "loss": 1.5472, + "step": 4500 + }, + { + "epoch": 3.7772194304857623, + "grad_norm": 0.6175141930580139, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 4510 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 4520 + }, + { + "epoch": 3.7939698492462313, + "grad_norm": 0.5986164808273315, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 4530 + }, + { + "epoch": 3.8023450586264658, + "grad_norm": 0.640072226524353, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4540 + }, + { + "epoch": 3.8107202680067003, + "grad_norm": 0.5742579698562622, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4550 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.6658656001091003, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4560 + }, + { + "epoch": 3.8274706867671693, + "grad_norm": 0.5475369691848755, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 4570 + }, + { + "epoch": 3.835845896147404, + "grad_norm": 0.613172173500061, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4580 + }, + { + "epoch": 3.8442211055276383, + "grad_norm": 0.590968132019043, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4590 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4600 + }, + { + "epoch": 3.8609715242881073, + "grad_norm": 0.6815178990364075, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4610 + }, + { + "epoch": 3.869346733668342, + "grad_norm": 0.6551400423049927, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4620 + }, + { + "epoch": 3.8777219430485763, + "grad_norm": 0.6398897171020508, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 4630 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6761762499809265, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 4640 + }, + { + "epoch": 3.8944723618090453, + "grad_norm": 0.6277294754981995, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4650 + }, + { + "epoch": 3.90284757118928, + "grad_norm": 0.6285301446914673, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 4660 + }, + { + "epoch": 3.9112227805695143, + "grad_norm": 0.5416069626808167, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4670 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.6314545273780823, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4680 + }, + { + "epoch": 3.9279731993299833, + "grad_norm": 0.604479968547821, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 4690 + }, + { + "epoch": 3.936348408710218, + "grad_norm": 0.5321660041809082, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 4700 + }, + { + "epoch": 3.9447236180904524, + "grad_norm": 0.6632516980171204, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 4710 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.5925896763801575, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 4720 + }, + { + "epoch": 3.9614740368509214, + "grad_norm": 0.6580308675765991, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4730 + }, + { + "epoch": 3.969849246231156, + "grad_norm": 0.5578170418739319, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 4740 + }, + { + "epoch": 3.9782244556113904, + "grad_norm": 0.6216608285903931, + "learning_rate": 0.0002, + "loss": 1.5394, + "step": 4750 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.5693069696426392, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 4760 + }, + { + "epoch": 3.9949748743718594, + "grad_norm": 0.5353434681892395, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4770 + }, + { + "epoch": 4.0, + "eval_loss": 1.8809821605682373, + "eval_runtime": 37.9695, + "eval_samples_per_second": 13.564, + "eval_steps_per_second": 1.712, + "step": 4776 + }, + { + "epoch": 4.0033500837520934, + "grad_norm": 0.6117817759513855, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4780 + }, + { + "epoch": 4.011725293132328, + "grad_norm": 0.6816073656082153, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 4790 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.715548038482666, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 4800 + }, + { + "epoch": 4.028475711892797, + "grad_norm": 0.8585814833641052, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 4810 + }, + { + "epoch": 4.0368509212730315, + "grad_norm": 0.7372158765792847, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 4820 + }, + { + "epoch": 4.045226130653266, + "grad_norm": 0.8915117979049683, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4830 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.9323588013648987, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 4840 + }, + { + "epoch": 4.061976549413735, + "grad_norm": 0.9298437237739563, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4850 + }, + { + "epoch": 4.0703517587939695, + "grad_norm": 0.8541792035102844, + "learning_rate": 0.0002, + "loss": 1.4173, + "step": 4860 + }, + { + "epoch": 4.078726968174204, + "grad_norm": 0.7833571434020996, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 4870 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.9325295090675354, + "learning_rate": 0.0002, + "loss": 1.3835, + "step": 4880 + }, + { + "epoch": 4.0954773869346734, + "grad_norm": 0.7066370248794556, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 4890 + }, + { + "epoch": 4.1038525963149075, + "grad_norm": 0.712640643119812, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 4900 + }, + { + "epoch": 4.1122278056951425, + "grad_norm": 0.6970218420028687, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4910 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.7979312539100647, + "learning_rate": 0.0002, + "loss": 1.3805, + "step": 4920 + }, + { + "epoch": 4.1289782244556115, + "grad_norm": 0.7801558375358582, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4930 + }, + { + "epoch": 4.1373534338358455, + "grad_norm": 0.7505159974098206, + "learning_rate": 0.0002, + "loss": 1.3288, + "step": 4940 + }, + { + "epoch": 4.1457286432160805, + "grad_norm": 0.738201916217804, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 4950 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7736659049987793, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4960 + }, + { + "epoch": 4.1624790619765495, + "grad_norm": 0.7850064635276794, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 4970 + }, + { + "epoch": 4.1708542713567835, + "grad_norm": 0.8316620588302612, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 4980 + }, + { + "epoch": 4.1792294807370185, + "grad_norm": 0.7217330932617188, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 4990 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.7050199508666992, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 5000 + }, + { + "epoch": 4.1959798994974875, + "grad_norm": 0.6992659568786621, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 5010 + }, + { + "epoch": 4.204355108877722, + "grad_norm": 0.7648445963859558, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 5020 + }, + { + "epoch": 4.2127303182579565, + "grad_norm": 0.8093137741088867, + "learning_rate": 0.0002, + "loss": 1.3339, + "step": 5030 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.6907750368118286, + "learning_rate": 0.0002, + "loss": 1.37, + "step": 5040 + }, + { + "epoch": 4.2294807370184255, + "grad_norm": 0.7000078558921814, + "learning_rate": 0.0002, + "loss": 1.4231, + "step": 5050 + }, + { + "epoch": 4.23785594639866, + "grad_norm": 0.715034008026123, + "learning_rate": 0.0002, + "loss": 1.3411, + "step": 5060 + }, + { + "epoch": 4.2462311557788945, + "grad_norm": 0.828895628452301, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5070 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7127292156219482, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 5080 + }, + { + "epoch": 4.2629815745393635, + "grad_norm": 0.8256623148918152, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 5090 + }, + { + "epoch": 4.271356783919598, + "grad_norm": 0.8062452077865601, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5100 + }, + { + "epoch": 4.279731993299833, + "grad_norm": 0.6861081123352051, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5110 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 0.7566041350364685, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 5120 + }, + { + "epoch": 4.296482412060302, + "grad_norm": 0.8734753727912903, + "learning_rate": 0.0002, + "loss": 1.4571, + "step": 5130 + }, + { + "epoch": 4.304857621440536, + "grad_norm": 0.8559320569038391, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5140 + }, + { + "epoch": 4.313232830820771, + "grad_norm": 0.6965576410293579, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 5150 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.8277813792228699, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 5160 + }, + { + "epoch": 4.32998324958124, + "grad_norm": 1.0733633041381836, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5170 + }, + { + "epoch": 4.338358458961474, + "grad_norm": 0.7914809584617615, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5180 + }, + { + "epoch": 4.346733668341709, + "grad_norm": 0.8307849168777466, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 5190 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.7066516280174255, + "learning_rate": 0.0002, + "loss": 1.4318, + "step": 5200 + }, + { + "epoch": 4.363484087102178, + "grad_norm": 0.9676792025566101, + "learning_rate": 0.0002, + "loss": 1.3866, + "step": 5210 + }, + { + "epoch": 4.371859296482412, + "grad_norm": 0.7672301530838013, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5220 + }, + { + "epoch": 4.380234505862647, + "grad_norm": 0.6888260245323181, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5230 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.8775295615196228, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 5240 + }, + { + "epoch": 4.396984924623116, + "grad_norm": 0.8742642998695374, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 5250 + }, + { + "epoch": 4.40536013400335, + "grad_norm": 0.6935433745384216, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 5260 + }, + { + "epoch": 4.413735343383585, + "grad_norm": 0.7726178169250488, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5270 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.7493860721588135, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5280 + }, + { + "epoch": 4.430485762144054, + "grad_norm": 0.7758517265319824, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 5290 + }, + { + "epoch": 4.438860971524288, + "grad_norm": 0.779315173625946, + "learning_rate": 0.0002, + "loss": 1.2916, + "step": 5300 + }, + { + "epoch": 4.447236180904523, + "grad_norm": 0.7753667235374451, + "learning_rate": 0.0002, + "loss": 1.4483, + "step": 5310 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.8738188743591309, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 5320 + }, + { + "epoch": 4.463986599664992, + "grad_norm": 0.8410757184028625, + "learning_rate": 0.0002, + "loss": 1.41, + "step": 5330 + }, + { + "epoch": 4.472361809045226, + "grad_norm": 0.728897750377655, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 5340 + }, + { + "epoch": 4.480737018425461, + "grad_norm": 0.7880531549453735, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 5350 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.8455142378807068, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 5360 + }, + { + "epoch": 4.49748743718593, + "grad_norm": 0.8527868986129761, + "learning_rate": 0.0002, + "loss": 1.431, + "step": 5370 + }, + { + "epoch": 4.505862646566165, + "grad_norm": 0.7743009328842163, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5380 + }, + { + "epoch": 4.514237855946399, + "grad_norm": 0.7555320858955383, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 5390 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.8146619200706482, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5400 + }, + { + "epoch": 4.530988274706868, + "grad_norm": 0.8042502999305725, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5410 + }, + { + "epoch": 4.539363484087103, + "grad_norm": 0.7329140305519104, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5420 + }, + { + "epoch": 4.547738693467337, + "grad_norm": 0.7574753165245056, + "learning_rate": 0.0002, + "loss": 1.3946, + "step": 5430 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 1.1223409175872803, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 5440 + }, + { + "epoch": 4.564489112227806, + "grad_norm": 0.7647369503974915, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5450 + }, + { + "epoch": 4.572864321608041, + "grad_norm": 0.9135531187057495, + "learning_rate": 0.0002, + "loss": 1.4569, + "step": 5460 + }, + { + "epoch": 4.581239530988275, + "grad_norm": 0.9343693852424622, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5470 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.869945764541626, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 5480 + }, + { + "epoch": 4.597989949748744, + "grad_norm": 0.7383785843849182, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5490 + }, + { + "epoch": 4.606365159128979, + "grad_norm": 0.7988699674606323, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5500 + }, + { + "epoch": 4.614740368509213, + "grad_norm": 0.8731256127357483, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 5510 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.7577664256095886, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5520 + }, + { + "epoch": 4.631490787269682, + "grad_norm": 0.7825039625167847, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5530 + }, + { + "epoch": 4.639865996649917, + "grad_norm": 0.8534902930259705, + "learning_rate": 0.0002, + "loss": 1.4594, + "step": 5540 + }, + { + "epoch": 4.648241206030151, + "grad_norm": 0.7403318285942078, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 5550 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.8229990005493164, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5560 + }, + { + "epoch": 4.66499162479062, + "grad_norm": 0.8279513716697693, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5570 + }, + { + "epoch": 4.673366834170855, + "grad_norm": 0.8923851251602173, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5580 + }, + { + "epoch": 4.681742043551089, + "grad_norm": 0.7457540035247803, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5590 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7110715508460999, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 5600 + }, + { + "epoch": 4.698492462311558, + "grad_norm": 0.7135499119758606, + "learning_rate": 0.0002, + "loss": 1.4327, + "step": 5610 + }, + { + "epoch": 4.706867671691793, + "grad_norm": 0.7606837153434753, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 5620 + }, + { + "epoch": 4.715242881072027, + "grad_norm": 0.9622916579246521, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5630 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.7665684819221497, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 5640 + }, + { + "epoch": 4.731993299832496, + "grad_norm": 0.7985475659370422, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 5650 + }, + { + "epoch": 4.740368509212731, + "grad_norm": 0.9179279208183289, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 5660 + }, + { + "epoch": 4.748743718592965, + "grad_norm": 0.8311634063720703, + "learning_rate": 0.0002, + "loss": 1.4379, + "step": 5670 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.7773269414901733, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 5680 + }, + { + "epoch": 4.765494137353434, + "grad_norm": 0.7771748900413513, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5690 + }, + { + "epoch": 4.773869346733669, + "grad_norm": 0.7518507242202759, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 5700 + }, + { + "epoch": 4.782244556113903, + "grad_norm": 0.7699326276779175, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 5710 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7001115679740906, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5720 + }, + { + "epoch": 4.798994974874372, + "grad_norm": 0.7220682501792908, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 5730 + }, + { + "epoch": 4.807370184254607, + "grad_norm": 0.7654005289077759, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5740 + }, + { + "epoch": 4.815745393634841, + "grad_norm": 0.8132795095443726, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5750 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.7105404138565063, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5760 + }, + { + "epoch": 4.83249581239531, + "grad_norm": 0.9346209764480591, + "learning_rate": 0.0002, + "loss": 1.4289, + "step": 5770 + }, + { + "epoch": 4.840871021775545, + "grad_norm": 1.0075623989105225, + "learning_rate": 0.0002, + "loss": 1.4066, + "step": 5780 + }, + { + "epoch": 4.849246231155779, + "grad_norm": 0.758376955986023, + "learning_rate": 0.0002, + "loss": 1.4558, + "step": 5790 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.854821503162384, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 5800 + }, + { + "epoch": 4.865996649916248, + "grad_norm": 0.8226943016052246, + "learning_rate": 0.0002, + "loss": 1.4014, + "step": 5810 + }, + { + "epoch": 4.874371859296483, + "grad_norm": 0.7510473728179932, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 5820 + }, + { + "epoch": 4.882747068676717, + "grad_norm": 0.7449678182601929, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 5830 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.7840824723243713, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 5840 + }, + { + "epoch": 4.899497487437186, + "grad_norm": 0.8811169862747192, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5850 + }, + { + "epoch": 4.907872696817421, + "grad_norm": 0.84914630651474, + "learning_rate": 0.0002, + "loss": 1.3827, + "step": 5860 + }, + { + "epoch": 4.916247906197655, + "grad_norm": 0.7514461874961853, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 5870 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.7229002118110657, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5880 + }, + { + "epoch": 4.932998324958124, + "grad_norm": 0.9418245553970337, + "learning_rate": 0.0002, + "loss": 1.4302, + "step": 5890 + }, + { + "epoch": 4.941373534338359, + "grad_norm": 0.7626827359199524, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5900 + }, + { + "epoch": 4.949748743718593, + "grad_norm": 0.7711105346679688, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 5910 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8689648509025574, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 5920 + }, + { + "epoch": 4.966499162479062, + "grad_norm": 0.7873271107673645, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 5930 + }, + { + "epoch": 4.974874371859297, + "grad_norm": 0.7637495994567871, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5940 + }, + { + "epoch": 4.983249581239531, + "grad_norm": 0.9907955527305603, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 5950 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7827328443527222, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 5960 + }, + { + "epoch": 5.0, + "grad_norm": 0.818544328212738, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5970 + }, + { + "epoch": 5.0, + "eval_loss": 1.9436752796173096, + "eval_runtime": 38.087, + "eval_samples_per_second": 13.522, + "eval_steps_per_second": 1.707, + "step": 5970 + }, + { + "epoch": 5.008375209380235, + "grad_norm": 1.1248953342437744, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 5980 + }, + { + "epoch": 5.016750418760469, + "grad_norm": 0.9285888075828552, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 5990 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 0.8626338839530945, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6000 + }, + { + "epoch": 5.033500837520938, + "grad_norm": 0.8253921270370483, + "learning_rate": 0.0002, + "loss": 1.1839, + "step": 6010 + }, + { + "epoch": 5.041876046901173, + "grad_norm": 1.079628586769104, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 6020 + }, + { + "epoch": 5.050251256281407, + "grad_norm": 0.902625322341919, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6030 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9593151211738586, + "learning_rate": 0.0002, + "loss": 1.164, + "step": 6040 + }, + { + "epoch": 5.067001675041876, + "grad_norm": 0.9276060461997986, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 6050 + }, + { + "epoch": 5.075376884422111, + "grad_norm": 1.0472362041473389, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 6060 + }, + { + "epoch": 5.083752093802345, + "grad_norm": 0.9126865863800049, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 6070 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.0797888040542603, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 6080 + }, + { + "epoch": 5.100502512562814, + "grad_norm": 0.9538877010345459, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 6090 + }, + { + "epoch": 5.108877721943049, + "grad_norm": 1.0604161024093628, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 6100 + }, + { + "epoch": 5.117252931323283, + "grad_norm": 1.0178192853927612, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6110 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 1.0262689590454102, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6120 + }, + { + "epoch": 5.134003350083752, + "grad_norm": 0.9046729803085327, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 6130 + }, + { + "epoch": 5.142378559463987, + "grad_norm": 1.1244608163833618, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 6140 + }, + { + "epoch": 5.150753768844221, + "grad_norm": 1.082835078239441, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6150 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9078734517097473, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 6160 + }, + { + "epoch": 5.16750418760469, + "grad_norm": 1.0688848495483398, + "learning_rate": 0.0002, + "loss": 1.2122, + "step": 6170 + }, + { + "epoch": 5.175879396984925, + "grad_norm": 1.137519359588623, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 6180 + }, + { + "epoch": 5.184254606365159, + "grad_norm": 1.0728670358657837, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6190 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 1.2384949922561646, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 6200 + }, + { + "epoch": 5.201005025125628, + "grad_norm": 0.8391274809837341, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6210 + }, + { + "epoch": 5.209380234505863, + "grad_norm": 0.8948764801025391, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 6220 + }, + { + "epoch": 5.217755443886097, + "grad_norm": 0.9568309783935547, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6230 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 1.0604485273361206, + "learning_rate": 0.0002, + "loss": 1.2761, + "step": 6240 + }, + { + "epoch": 5.234505862646566, + "grad_norm": 1.1278935670852661, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6250 + }, + { + "epoch": 5.242881072026801, + "grad_norm": 0.9903607368469238, + "learning_rate": 0.0002, + "loss": 1.2332, + "step": 6260 + }, + { + "epoch": 5.251256281407035, + "grad_norm": 0.958718478679657, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6270 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.127510905265808, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 6280 + }, + { + "epoch": 5.268006700167504, + "grad_norm": 1.1683127880096436, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6290 + }, + { + "epoch": 5.276381909547739, + "grad_norm": 1.0723326206207275, + "learning_rate": 0.0002, + "loss": 1.2959, + "step": 6300 + }, + { + "epoch": 5.284757118927973, + "grad_norm": 0.9285374283790588, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 6310 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 0.9201741218566895, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6320 + }, + { + "epoch": 5.301507537688442, + "grad_norm": 0.9606702923774719, + "learning_rate": 0.0002, + "loss": 1.1816, + "step": 6330 + }, + { + "epoch": 5.309882747068677, + "grad_norm": 1.107960820198059, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 6340 + }, + { + "epoch": 5.318257956448911, + "grad_norm": 0.9342933297157288, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 6350 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 0.9170576930046082, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6360 + }, + { + "epoch": 5.33500837520938, + "grad_norm": 0.7612091898918152, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6370 + }, + { + "epoch": 5.343383584589615, + "grad_norm": 1.2524093389511108, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6380 + }, + { + "epoch": 5.351758793969849, + "grad_norm": 0.8481650352478027, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6390 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.0562204122543335, + "learning_rate": 0.0002, + "loss": 1.237, + "step": 6400 + }, + { + "epoch": 5.368509212730318, + "grad_norm": 0.96522456407547, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 6410 + }, + { + "epoch": 5.376884422110553, + "grad_norm": 0.9680143594741821, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6420 + }, + { + "epoch": 5.385259631490787, + "grad_norm": 0.9743781685829163, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 6430 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.8907374143600464, + "learning_rate": 0.0002, + "loss": 1.2637, + "step": 6440 + }, + { + "epoch": 5.402010050251256, + "grad_norm": 1.3755217790603638, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 6450 + }, + { + "epoch": 5.410385259631491, + "grad_norm": 1.1926233768463135, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6460 + }, + { + "epoch": 5.418760469011725, + "grad_norm": 0.8343448638916016, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 6470 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0056027173995972, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6480 + }, + { + "epoch": 5.435510887772194, + "grad_norm": 0.9482131600379944, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6490 + }, + { + "epoch": 5.443886097152429, + "grad_norm": 0.9766585826873779, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6500 + }, + { + "epoch": 5.452261306532663, + "grad_norm": 0.9226584434509277, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 6510 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9605025053024292, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6520 + }, + { + "epoch": 5.469011725293132, + "grad_norm": 1.0022773742675781, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 6530 + }, + { + "epoch": 5.477386934673367, + "grad_norm": 1.056764841079712, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 6540 + }, + { + "epoch": 5.485762144053601, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6550 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.8987206816673279, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 6560 + }, + { + "epoch": 5.50251256281407, + "grad_norm": 1.1946845054626465, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 6570 + }, + { + "epoch": 5.510887772194305, + "grad_norm": 1.037416696548462, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6580 + }, + { + "epoch": 5.519262981574539, + "grad_norm": 1.085598349571228, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6590 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9253745079040527, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6600 + }, + { + "epoch": 5.536013400335008, + "grad_norm": 1.0624418258666992, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6610 + }, + { + "epoch": 5.544388609715243, + "grad_norm": 1.002821922302246, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6620 + }, + { + "epoch": 5.552763819095477, + "grad_norm": 0.9343662858009338, + "learning_rate": 0.0002, + "loss": 1.2662, + "step": 6630 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.9129965305328369, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6640 + }, + { + "epoch": 5.569514237855946, + "grad_norm": 1.220263957977295, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 6650 + }, + { + "epoch": 5.577889447236181, + "grad_norm": 0.9705421924591064, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6660 + }, + { + "epoch": 5.586264656616415, + "grad_norm": 0.8417587876319885, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 6670 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9351304769515991, + "learning_rate": 0.0002, + "loss": 1.3616, + "step": 6680 + }, + { + "epoch": 5.603015075376884, + "grad_norm": 1.012598991394043, + "learning_rate": 0.0002, + "loss": 1.2795, + "step": 6690 + }, + { + "epoch": 5.611390284757119, + "grad_norm": 1.018328309059143, + "learning_rate": 0.0002, + "loss": 1.2457, + "step": 6700 + }, + { + "epoch": 5.619765494137353, + "grad_norm": 0.9289278388023376, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6710 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.8390841484069824, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 6720 + }, + { + "epoch": 5.636515912897822, + "grad_norm": 0.9989390969276428, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 6730 + }, + { + "epoch": 5.644891122278057, + "grad_norm": 1.0675761699676514, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 6740 + }, + { + "epoch": 5.653266331658291, + "grad_norm": 1.0649791955947876, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 6750 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 0.8542222380638123, + "learning_rate": 0.0002, + "loss": 1.2191, + "step": 6760 + }, + { + "epoch": 5.67001675041876, + "grad_norm": 0.9148173928260803, + "learning_rate": 0.0002, + "loss": 1.2336, + "step": 6770 + }, + { + "epoch": 5.678391959798995, + "grad_norm": 0.978024423122406, + "learning_rate": 0.0002, + "loss": 1.3286, + "step": 6780 + }, + { + "epoch": 5.686767169179229, + "grad_norm": 1.0385138988494873, + "learning_rate": 0.0002, + "loss": 1.2821, + "step": 6790 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9687889218330383, + "learning_rate": 0.0002, + "loss": 1.218, + "step": 6800 + }, + { + "epoch": 5.703517587939698, + "grad_norm": 0.862335205078125, + "learning_rate": 0.0002, + "loss": 1.3256, + "step": 6810 + }, + { + "epoch": 5.711892797319933, + "grad_norm": 0.9729578495025635, + "learning_rate": 0.0002, + "loss": 1.2783, + "step": 6820 + }, + { + "epoch": 5.720268006700167, + "grad_norm": 0.8936806321144104, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6830 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9222455620765686, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 6840 + }, + { + "epoch": 5.7370184254606365, + "grad_norm": 1.0584437847137451, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6850 + }, + { + "epoch": 5.745393634840871, + "grad_norm": 0.9114518165588379, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 6860 + }, + { + "epoch": 5.7537688442211055, + "grad_norm": 0.9590078592300415, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6870 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9056822061538696, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 6880 + }, + { + "epoch": 5.7705192629815745, + "grad_norm": 1.0069063901901245, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6890 + }, + { + "epoch": 5.778894472361809, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 1.3382, + "step": 6900 + }, + { + "epoch": 5.7872696817420435, + "grad_norm": 0.881629228591919, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 6910 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 1.1020095348358154, + "learning_rate": 0.0002, + "loss": 1.3122, + "step": 6920 + }, + { + "epoch": 5.8040201005025125, + "grad_norm": 0.8774619102478027, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6930 + }, + { + "epoch": 5.812395309882747, + "grad_norm": 0.9321739673614502, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6940 + }, + { + "epoch": 5.8207705192629815, + "grad_norm": 0.9082857966423035, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 6950 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9119554758071899, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6960 + }, + { + "epoch": 5.8375209380234505, + "grad_norm": 1.0643284320831299, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 6970 + }, + { + "epoch": 5.8458961474036855, + "grad_norm": 0.8526089787483215, + "learning_rate": 0.0002, + "loss": 1.3319, + "step": 6980 + }, + { + "epoch": 5.8542713567839195, + "grad_norm": 0.930439829826355, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6990 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.0461677312850952, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 7000 + }, + { + "epoch": 5.8710217755443885, + "grad_norm": 0.92561936378479, + "learning_rate": 0.0002, + "loss": 1.2623, + "step": 7010 + }, + { + "epoch": 5.8793969849246235, + "grad_norm": 0.8936395049095154, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 7020 + }, + { + "epoch": 5.8877721943048575, + "grad_norm": 0.986539363861084, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 7030 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.8776476383209229, + "learning_rate": 0.0002, + "loss": 1.2399, + "step": 7040 + }, + { + "epoch": 5.9045226130653266, + "grad_norm": 1.0256905555725098, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 7050 + }, + { + "epoch": 5.9128978224455615, + "grad_norm": 0.96241295337677, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 7060 + }, + { + "epoch": 5.921273031825796, + "grad_norm": 1.0251280069351196, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 7070 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.0794076919555664, + "learning_rate": 0.0002, + "loss": 1.2225, + "step": 7080 + }, + { + "epoch": 5.938023450586265, + "grad_norm": 0.9852448105812073, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 7090 + }, + { + "epoch": 5.9463986599664995, + "grad_norm": 1.1678671836853027, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 7100 + }, + { + "epoch": 5.954773869346734, + "grad_norm": 0.9818310141563416, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 7110 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 1.0732046365737915, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 7120 + }, + { + "epoch": 5.971524288107203, + "grad_norm": 0.912470281124115, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 7130 + }, + { + "epoch": 5.9798994974874375, + "grad_norm": 1.0944788455963135, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 7140 + }, + { + "epoch": 5.988274706867672, + "grad_norm": 1.0393965244293213, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 7150 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.8758739233016968, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 7160 + }, + { + "epoch": 6.0, + "eval_loss": 2.0526134967803955, + "eval_runtime": 37.9699, + "eval_samples_per_second": 13.563, + "eval_steps_per_second": 1.712, + "step": 7164 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.315338791543112e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-7164/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..09a5a9534e476c1a0d525202de0d7b7311c4e725 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ac009446d566f7a06dd36662f565a8f10486c27eaaa41f6b239034b99545d0 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2be1ff0a0a32855dac28e420e9a5e2eea8884aeb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3f814311686751a6d282b4824b064c223a32e5704f316f3256685bb8a6be39 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b40671fec9fe5e1eeacef6981d857054f7e8b52e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5e63935e954434886648187a851bcaea0326154973f513282dd8416aec9161 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..00128d8dff795a0ebf2fe46f75ea8b2de20c59c6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d286288e9bc5e2bc4cdca79db9af793735711a191840e5d0d6ad71b684226b7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..298057e2b05aef6b047cd0c24fccca1fb9d12715 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/trainer_state.json @@ -0,0 +1,5934 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 8358, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + }, + { + "epoch": 3.006700167504188, + "grad_norm": 0.6784713268280029, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3590 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5783940553665161, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3600 + }, + { + "epoch": 3.023450586264657, + "grad_norm": 0.5408937335014343, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 3610 + }, + { + "epoch": 3.0318257956448913, + "grad_norm": 0.5229013562202454, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3620 + }, + { + "epoch": 3.040201005025126, + "grad_norm": 0.49160143733024597, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 3630 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.6563201546669006, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 3640 + }, + { + "epoch": 3.056951423785595, + "grad_norm": 0.5686020851135254, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3650 + }, + { + "epoch": 3.0653266331658293, + "grad_norm": 0.5774043202400208, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3660 + }, + { + "epoch": 3.073701842546064, + "grad_norm": 0.6106171011924744, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 3670 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.517433226108551, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3680 + }, + { + "epoch": 3.090452261306533, + "grad_norm": 0.5681702494621277, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3690 + }, + { + "epoch": 3.0988274706867673, + "grad_norm": 0.5769233107566833, + "learning_rate": 0.0002, + "loss": 1.4731, + "step": 3700 + }, + { + "epoch": 3.107202680067002, + "grad_norm": 0.5657462477684021, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 3710 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.6035246253013611, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3720 + }, + { + "epoch": 3.123953098827471, + "grad_norm": 0.7286643385887146, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 3730 + }, + { + "epoch": 3.1323283082077054, + "grad_norm": 0.5121201872825623, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 3740 + }, + { + "epoch": 3.14070351758794, + "grad_norm": 0.5074213147163391, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3750 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.57481849193573, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 3760 + }, + { + "epoch": 3.157453936348409, + "grad_norm": 0.6326663494110107, + "learning_rate": 0.0002, + "loss": 1.4765, + "step": 3770 + }, + { + "epoch": 3.1658291457286434, + "grad_norm": 0.6039315462112427, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3780 + }, + { + "epoch": 3.174204355108878, + "grad_norm": 0.6936715245246887, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 3790 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.6516796946525574, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3800 + }, + { + "epoch": 3.190954773869347, + "grad_norm": 0.6140730977058411, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3810 + }, + { + "epoch": 3.1993299832495814, + "grad_norm": 0.631328284740448, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3820 + }, + { + "epoch": 3.207705192629816, + "grad_norm": 0.6265402436256409, + "learning_rate": 0.0002, + "loss": 1.4844, + "step": 3830 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6649428606033325, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3840 + }, + { + "epoch": 3.224455611390285, + "grad_norm": 0.5329259634017944, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 3850 + }, + { + "epoch": 3.2328308207705194, + "grad_norm": 0.6008304953575134, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3860 + }, + { + "epoch": 3.241206030150754, + "grad_norm": 0.5918582081794739, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 3870 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.643622100353241, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3880 + }, + { + "epoch": 3.257956448911223, + "grad_norm": 0.5517964363098145, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 3890 + }, + { + "epoch": 3.2663316582914574, + "grad_norm": 0.6780755519866943, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 3900 + }, + { + "epoch": 3.274706867671692, + "grad_norm": 0.6742202639579773, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3910 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.6228749752044678, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 3920 + }, + { + "epoch": 3.291457286432161, + "grad_norm": 0.5836303234100342, + "learning_rate": 0.0002, + "loss": 1.4899, + "step": 3930 + }, + { + "epoch": 3.2998324958123955, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3940 + }, + { + "epoch": 3.30820770519263, + "grad_norm": 0.6345084309577942, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 3950 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.6125303506851196, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3960 + }, + { + "epoch": 3.324958123953099, + "grad_norm": 0.6259911060333252, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 3970 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.645745575428009, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3980 + }, + { + "epoch": 3.341708542713568, + "grad_norm": 0.6666176915168762, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.59013831615448, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 4000 + }, + { + "epoch": 3.358458961474037, + "grad_norm": 0.6604634523391724, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 4010 + }, + { + "epoch": 3.3668341708542715, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 4020 + }, + { + "epoch": 3.375209380234506, + "grad_norm": 0.515724778175354, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 4030 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.681968092918396, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4040 + }, + { + "epoch": 3.391959798994975, + "grad_norm": 0.5978158116340637, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4050 + }, + { + "epoch": 3.4003350083752095, + "grad_norm": 0.6043432354927063, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 4060 + }, + { + "epoch": 3.408710217755444, + "grad_norm": 0.5899770855903625, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4070 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6014242172241211, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4080 + }, + { + "epoch": 3.425460636515913, + "grad_norm": 0.5944811105728149, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 4090 + }, + { + "epoch": 3.4338358458961475, + "grad_norm": 0.6506822109222412, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4100 + }, + { + "epoch": 3.442211055276382, + "grad_norm": 0.6926528811454773, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 4110 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.5646378993988037, + "learning_rate": 0.0002, + "loss": 1.5169, + "step": 4120 + }, + { + "epoch": 3.458961474036851, + "grad_norm": 0.7233654856681824, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 4130 + }, + { + "epoch": 3.4673366834170856, + "grad_norm": 0.6231815814971924, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4140 + }, + { + "epoch": 3.47571189279732, + "grad_norm": 0.6115689873695374, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 4150 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5812674760818481, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 4160 + }, + { + "epoch": 3.492462311557789, + "grad_norm": 0.6099632978439331, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 4170 + }, + { + "epoch": 3.5008375209380236, + "grad_norm": 0.6102647185325623, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 4180 + }, + { + "epoch": 3.509212730318258, + "grad_norm": 0.6034680008888245, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4190 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.6281666159629822, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 4200 + }, + { + "epoch": 3.525963149078727, + "grad_norm": 0.6245372295379639, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 4210 + }, + { + "epoch": 3.5343383584589616, + "grad_norm": 0.5897293090820312, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 4220 + }, + { + "epoch": 3.542713567839196, + "grad_norm": 0.601054847240448, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4230 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.7004473805427551, + "learning_rate": 0.0002, + "loss": 1.4974, + "step": 4240 + }, + { + "epoch": 3.559463986599665, + "grad_norm": 0.6601553559303284, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4250 + }, + { + "epoch": 3.5678391959798996, + "grad_norm": 0.6112467050552368, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4260 + }, + { + "epoch": 3.576214405360134, + "grad_norm": 0.5902454853057861, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4270 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5792450904846191, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4280 + }, + { + "epoch": 3.592964824120603, + "grad_norm": 0.5923888087272644, + "learning_rate": 0.0002, + "loss": 1.4664, + "step": 4290 + }, + { + "epoch": 3.6013400335008376, + "grad_norm": 0.5869482159614563, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 4300 + }, + { + "epoch": 3.609715242881072, + "grad_norm": 0.6372929811477661, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 4310 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.6350686550140381, + "learning_rate": 0.0002, + "loss": 1.4977, + "step": 4320 + }, + { + "epoch": 3.626465661641541, + "grad_norm": 0.571819007396698, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4330 + }, + { + "epoch": 3.6348408710217757, + "grad_norm": 0.592250645160675, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4340 + }, + { + "epoch": 3.64321608040201, + "grad_norm": 0.6110650897026062, + "learning_rate": 0.0002, + "loss": 1.4912, + "step": 4350 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.6187081336975098, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 4360 + }, + { + "epoch": 3.659966499162479, + "grad_norm": 0.6197671890258789, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 4370 + }, + { + "epoch": 3.6683417085427137, + "grad_norm": 0.6050862669944763, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 3.676716917922948, + "grad_norm": 0.621265172958374, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 4390 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.6552940011024475, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4400 + }, + { + "epoch": 3.693467336683417, + "grad_norm": 0.5638861060142517, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 4410 + }, + { + "epoch": 3.7018425460636517, + "grad_norm": 0.6388863325119019, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4420 + }, + { + "epoch": 3.710217755443886, + "grad_norm": 0.6062559485435486, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 4430 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5800350308418274, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 4440 + }, + { + "epoch": 3.726968174204355, + "grad_norm": 0.5954474210739136, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 4450 + }, + { + "epoch": 3.7353433835845897, + "grad_norm": 0.5880125761032104, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 4460 + }, + { + "epoch": 3.7437185929648242, + "grad_norm": 0.5880921483039856, + "learning_rate": 0.0002, + "loss": 1.5055, + "step": 4470 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.5995073914527893, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4480 + }, + { + "epoch": 3.7604690117252932, + "grad_norm": 0.5958493947982788, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4490 + }, + { + "epoch": 3.7688442211055277, + "grad_norm": 0.5694711804389954, + "learning_rate": 0.0002, + "loss": 1.5472, + "step": 4500 + }, + { + "epoch": 3.7772194304857623, + "grad_norm": 0.6175141930580139, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 4510 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 4520 + }, + { + "epoch": 3.7939698492462313, + "grad_norm": 0.5986164808273315, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 4530 + }, + { + "epoch": 3.8023450586264658, + "grad_norm": 0.640072226524353, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4540 + }, + { + "epoch": 3.8107202680067003, + "grad_norm": 0.5742579698562622, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4550 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.6658656001091003, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4560 + }, + { + "epoch": 3.8274706867671693, + "grad_norm": 0.5475369691848755, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 4570 + }, + { + "epoch": 3.835845896147404, + "grad_norm": 0.613172173500061, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4580 + }, + { + "epoch": 3.8442211055276383, + "grad_norm": 0.590968132019043, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4590 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4600 + }, + { + "epoch": 3.8609715242881073, + "grad_norm": 0.6815178990364075, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4610 + }, + { + "epoch": 3.869346733668342, + "grad_norm": 0.6551400423049927, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4620 + }, + { + "epoch": 3.8777219430485763, + "grad_norm": 0.6398897171020508, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 4630 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6761762499809265, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 4640 + }, + { + "epoch": 3.8944723618090453, + "grad_norm": 0.6277294754981995, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4650 + }, + { + "epoch": 3.90284757118928, + "grad_norm": 0.6285301446914673, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 4660 + }, + { + "epoch": 3.9112227805695143, + "grad_norm": 0.5416069626808167, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4670 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.6314545273780823, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4680 + }, + { + "epoch": 3.9279731993299833, + "grad_norm": 0.604479968547821, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 4690 + }, + { + "epoch": 3.936348408710218, + "grad_norm": 0.5321660041809082, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 4700 + }, + { + "epoch": 3.9447236180904524, + "grad_norm": 0.6632516980171204, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 4710 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.5925896763801575, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 4720 + }, + { + "epoch": 3.9614740368509214, + "grad_norm": 0.6580308675765991, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4730 + }, + { + "epoch": 3.969849246231156, + "grad_norm": 0.5578170418739319, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 4740 + }, + { + "epoch": 3.9782244556113904, + "grad_norm": 0.6216608285903931, + "learning_rate": 0.0002, + "loss": 1.5394, + "step": 4750 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.5693069696426392, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 4760 + }, + { + "epoch": 3.9949748743718594, + "grad_norm": 0.5353434681892395, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4770 + }, + { + "epoch": 4.0, + "eval_loss": 1.8809821605682373, + "eval_runtime": 37.9695, + "eval_samples_per_second": 13.564, + "eval_steps_per_second": 1.712, + "step": 4776 + }, + { + "epoch": 4.0033500837520934, + "grad_norm": 0.6117817759513855, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4780 + }, + { + "epoch": 4.011725293132328, + "grad_norm": 0.6816073656082153, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 4790 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.715548038482666, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 4800 + }, + { + "epoch": 4.028475711892797, + "grad_norm": 0.8585814833641052, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 4810 + }, + { + "epoch": 4.0368509212730315, + "grad_norm": 0.7372158765792847, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 4820 + }, + { + "epoch": 4.045226130653266, + "grad_norm": 0.8915117979049683, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4830 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.9323588013648987, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 4840 + }, + { + "epoch": 4.061976549413735, + "grad_norm": 0.9298437237739563, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4850 + }, + { + "epoch": 4.0703517587939695, + "grad_norm": 0.8541792035102844, + "learning_rate": 0.0002, + "loss": 1.4173, + "step": 4860 + }, + { + "epoch": 4.078726968174204, + "grad_norm": 0.7833571434020996, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 4870 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.9325295090675354, + "learning_rate": 0.0002, + "loss": 1.3835, + "step": 4880 + }, + { + "epoch": 4.0954773869346734, + "grad_norm": 0.7066370248794556, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 4890 + }, + { + "epoch": 4.1038525963149075, + "grad_norm": 0.712640643119812, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 4900 + }, + { + "epoch": 4.1122278056951425, + "grad_norm": 0.6970218420028687, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4910 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.7979312539100647, + "learning_rate": 0.0002, + "loss": 1.3805, + "step": 4920 + }, + { + "epoch": 4.1289782244556115, + "grad_norm": 0.7801558375358582, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4930 + }, + { + "epoch": 4.1373534338358455, + "grad_norm": 0.7505159974098206, + "learning_rate": 0.0002, + "loss": 1.3288, + "step": 4940 + }, + { + "epoch": 4.1457286432160805, + "grad_norm": 0.738201916217804, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 4950 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7736659049987793, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4960 + }, + { + "epoch": 4.1624790619765495, + "grad_norm": 0.7850064635276794, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 4970 + }, + { + "epoch": 4.1708542713567835, + "grad_norm": 0.8316620588302612, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 4980 + }, + { + "epoch": 4.1792294807370185, + "grad_norm": 0.7217330932617188, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 4990 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.7050199508666992, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 5000 + }, + { + "epoch": 4.1959798994974875, + "grad_norm": 0.6992659568786621, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 5010 + }, + { + "epoch": 4.204355108877722, + "grad_norm": 0.7648445963859558, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 5020 + }, + { + "epoch": 4.2127303182579565, + "grad_norm": 0.8093137741088867, + "learning_rate": 0.0002, + "loss": 1.3339, + "step": 5030 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.6907750368118286, + "learning_rate": 0.0002, + "loss": 1.37, + "step": 5040 + }, + { + "epoch": 4.2294807370184255, + "grad_norm": 0.7000078558921814, + "learning_rate": 0.0002, + "loss": 1.4231, + "step": 5050 + }, + { + "epoch": 4.23785594639866, + "grad_norm": 0.715034008026123, + "learning_rate": 0.0002, + "loss": 1.3411, + "step": 5060 + }, + { + "epoch": 4.2462311557788945, + "grad_norm": 0.828895628452301, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5070 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7127292156219482, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 5080 + }, + { + "epoch": 4.2629815745393635, + "grad_norm": 0.8256623148918152, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 5090 + }, + { + "epoch": 4.271356783919598, + "grad_norm": 0.8062452077865601, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5100 + }, + { + "epoch": 4.279731993299833, + "grad_norm": 0.6861081123352051, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5110 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 0.7566041350364685, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 5120 + }, + { + "epoch": 4.296482412060302, + "grad_norm": 0.8734753727912903, + "learning_rate": 0.0002, + "loss": 1.4571, + "step": 5130 + }, + { + "epoch": 4.304857621440536, + "grad_norm": 0.8559320569038391, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5140 + }, + { + "epoch": 4.313232830820771, + "grad_norm": 0.6965576410293579, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 5150 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.8277813792228699, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 5160 + }, + { + "epoch": 4.32998324958124, + "grad_norm": 1.0733633041381836, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5170 + }, + { + "epoch": 4.338358458961474, + "grad_norm": 0.7914809584617615, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5180 + }, + { + "epoch": 4.346733668341709, + "grad_norm": 0.8307849168777466, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 5190 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.7066516280174255, + "learning_rate": 0.0002, + "loss": 1.4318, + "step": 5200 + }, + { + "epoch": 4.363484087102178, + "grad_norm": 0.9676792025566101, + "learning_rate": 0.0002, + "loss": 1.3866, + "step": 5210 + }, + { + "epoch": 4.371859296482412, + "grad_norm": 0.7672301530838013, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5220 + }, + { + "epoch": 4.380234505862647, + "grad_norm": 0.6888260245323181, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5230 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.8775295615196228, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 5240 + }, + { + "epoch": 4.396984924623116, + "grad_norm": 0.8742642998695374, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 5250 + }, + { + "epoch": 4.40536013400335, + "grad_norm": 0.6935433745384216, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 5260 + }, + { + "epoch": 4.413735343383585, + "grad_norm": 0.7726178169250488, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5270 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.7493860721588135, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5280 + }, + { + "epoch": 4.430485762144054, + "grad_norm": 0.7758517265319824, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 5290 + }, + { + "epoch": 4.438860971524288, + "grad_norm": 0.779315173625946, + "learning_rate": 0.0002, + "loss": 1.2916, + "step": 5300 + }, + { + "epoch": 4.447236180904523, + "grad_norm": 0.7753667235374451, + "learning_rate": 0.0002, + "loss": 1.4483, + "step": 5310 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.8738188743591309, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 5320 + }, + { + "epoch": 4.463986599664992, + "grad_norm": 0.8410757184028625, + "learning_rate": 0.0002, + "loss": 1.41, + "step": 5330 + }, + { + "epoch": 4.472361809045226, + "grad_norm": 0.728897750377655, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 5340 + }, + { + "epoch": 4.480737018425461, + "grad_norm": 0.7880531549453735, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 5350 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.8455142378807068, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 5360 + }, + { + "epoch": 4.49748743718593, + "grad_norm": 0.8527868986129761, + "learning_rate": 0.0002, + "loss": 1.431, + "step": 5370 + }, + { + "epoch": 4.505862646566165, + "grad_norm": 0.7743009328842163, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5380 + }, + { + "epoch": 4.514237855946399, + "grad_norm": 0.7555320858955383, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 5390 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.8146619200706482, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5400 + }, + { + "epoch": 4.530988274706868, + "grad_norm": 0.8042502999305725, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5410 + }, + { + "epoch": 4.539363484087103, + "grad_norm": 0.7329140305519104, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5420 + }, + { + "epoch": 4.547738693467337, + "grad_norm": 0.7574753165245056, + "learning_rate": 0.0002, + "loss": 1.3946, + "step": 5430 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 1.1223409175872803, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 5440 + }, + { + "epoch": 4.564489112227806, + "grad_norm": 0.7647369503974915, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5450 + }, + { + "epoch": 4.572864321608041, + "grad_norm": 0.9135531187057495, + "learning_rate": 0.0002, + "loss": 1.4569, + "step": 5460 + }, + { + "epoch": 4.581239530988275, + "grad_norm": 0.9343693852424622, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5470 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.869945764541626, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 5480 + }, + { + "epoch": 4.597989949748744, + "grad_norm": 0.7383785843849182, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5490 + }, + { + "epoch": 4.606365159128979, + "grad_norm": 0.7988699674606323, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5500 + }, + { + "epoch": 4.614740368509213, + "grad_norm": 0.8731256127357483, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 5510 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.7577664256095886, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5520 + }, + { + "epoch": 4.631490787269682, + "grad_norm": 0.7825039625167847, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5530 + }, + { + "epoch": 4.639865996649917, + "grad_norm": 0.8534902930259705, + "learning_rate": 0.0002, + "loss": 1.4594, + "step": 5540 + }, + { + "epoch": 4.648241206030151, + "grad_norm": 0.7403318285942078, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 5550 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.8229990005493164, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5560 + }, + { + "epoch": 4.66499162479062, + "grad_norm": 0.8279513716697693, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5570 + }, + { + "epoch": 4.673366834170855, + "grad_norm": 0.8923851251602173, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5580 + }, + { + "epoch": 4.681742043551089, + "grad_norm": 0.7457540035247803, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5590 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7110715508460999, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 5600 + }, + { + "epoch": 4.698492462311558, + "grad_norm": 0.7135499119758606, + "learning_rate": 0.0002, + "loss": 1.4327, + "step": 5610 + }, + { + "epoch": 4.706867671691793, + "grad_norm": 0.7606837153434753, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 5620 + }, + { + "epoch": 4.715242881072027, + "grad_norm": 0.9622916579246521, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5630 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.7665684819221497, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 5640 + }, + { + "epoch": 4.731993299832496, + "grad_norm": 0.7985475659370422, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 5650 + }, + { + "epoch": 4.740368509212731, + "grad_norm": 0.9179279208183289, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 5660 + }, + { + "epoch": 4.748743718592965, + "grad_norm": 0.8311634063720703, + "learning_rate": 0.0002, + "loss": 1.4379, + "step": 5670 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.7773269414901733, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 5680 + }, + { + "epoch": 4.765494137353434, + "grad_norm": 0.7771748900413513, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5690 + }, + { + "epoch": 4.773869346733669, + "grad_norm": 0.7518507242202759, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 5700 + }, + { + "epoch": 4.782244556113903, + "grad_norm": 0.7699326276779175, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 5710 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7001115679740906, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5720 + }, + { + "epoch": 4.798994974874372, + "grad_norm": 0.7220682501792908, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 5730 + }, + { + "epoch": 4.807370184254607, + "grad_norm": 0.7654005289077759, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5740 + }, + { + "epoch": 4.815745393634841, + "grad_norm": 0.8132795095443726, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5750 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.7105404138565063, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5760 + }, + { + "epoch": 4.83249581239531, + "grad_norm": 0.9346209764480591, + "learning_rate": 0.0002, + "loss": 1.4289, + "step": 5770 + }, + { + "epoch": 4.840871021775545, + "grad_norm": 1.0075623989105225, + "learning_rate": 0.0002, + "loss": 1.4066, + "step": 5780 + }, + { + "epoch": 4.849246231155779, + "grad_norm": 0.758376955986023, + "learning_rate": 0.0002, + "loss": 1.4558, + "step": 5790 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.854821503162384, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 5800 + }, + { + "epoch": 4.865996649916248, + "grad_norm": 0.8226943016052246, + "learning_rate": 0.0002, + "loss": 1.4014, + "step": 5810 + }, + { + "epoch": 4.874371859296483, + "grad_norm": 0.7510473728179932, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 5820 + }, + { + "epoch": 4.882747068676717, + "grad_norm": 0.7449678182601929, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 5830 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.7840824723243713, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 5840 + }, + { + "epoch": 4.899497487437186, + "grad_norm": 0.8811169862747192, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5850 + }, + { + "epoch": 4.907872696817421, + "grad_norm": 0.84914630651474, + "learning_rate": 0.0002, + "loss": 1.3827, + "step": 5860 + }, + { + "epoch": 4.916247906197655, + "grad_norm": 0.7514461874961853, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 5870 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.7229002118110657, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5880 + }, + { + "epoch": 4.932998324958124, + "grad_norm": 0.9418245553970337, + "learning_rate": 0.0002, + "loss": 1.4302, + "step": 5890 + }, + { + "epoch": 4.941373534338359, + "grad_norm": 0.7626827359199524, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5900 + }, + { + "epoch": 4.949748743718593, + "grad_norm": 0.7711105346679688, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 5910 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8689648509025574, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 5920 + }, + { + "epoch": 4.966499162479062, + "grad_norm": 0.7873271107673645, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 5930 + }, + { + "epoch": 4.974874371859297, + "grad_norm": 0.7637495994567871, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5940 + }, + { + "epoch": 4.983249581239531, + "grad_norm": 0.9907955527305603, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 5950 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7827328443527222, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 5960 + }, + { + "epoch": 5.0, + "grad_norm": 0.818544328212738, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5970 + }, + { + "epoch": 5.0, + "eval_loss": 1.9436752796173096, + "eval_runtime": 38.087, + "eval_samples_per_second": 13.522, + "eval_steps_per_second": 1.707, + "step": 5970 + }, + { + "epoch": 5.008375209380235, + "grad_norm": 1.1248953342437744, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 5980 + }, + { + "epoch": 5.016750418760469, + "grad_norm": 0.9285888075828552, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 5990 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 0.8626338839530945, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6000 + }, + { + "epoch": 5.033500837520938, + "grad_norm": 0.8253921270370483, + "learning_rate": 0.0002, + "loss": 1.1839, + "step": 6010 + }, + { + "epoch": 5.041876046901173, + "grad_norm": 1.079628586769104, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 6020 + }, + { + "epoch": 5.050251256281407, + "grad_norm": 0.902625322341919, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6030 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9593151211738586, + "learning_rate": 0.0002, + "loss": 1.164, + "step": 6040 + }, + { + "epoch": 5.067001675041876, + "grad_norm": 0.9276060461997986, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 6050 + }, + { + "epoch": 5.075376884422111, + "grad_norm": 1.0472362041473389, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 6060 + }, + { + "epoch": 5.083752093802345, + "grad_norm": 0.9126865863800049, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 6070 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.0797888040542603, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 6080 + }, + { + "epoch": 5.100502512562814, + "grad_norm": 0.9538877010345459, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 6090 + }, + { + "epoch": 5.108877721943049, + "grad_norm": 1.0604161024093628, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 6100 + }, + { + "epoch": 5.117252931323283, + "grad_norm": 1.0178192853927612, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6110 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 1.0262689590454102, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6120 + }, + { + "epoch": 5.134003350083752, + "grad_norm": 0.9046729803085327, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 6130 + }, + { + "epoch": 5.142378559463987, + "grad_norm": 1.1244608163833618, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 6140 + }, + { + "epoch": 5.150753768844221, + "grad_norm": 1.082835078239441, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6150 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9078734517097473, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 6160 + }, + { + "epoch": 5.16750418760469, + "grad_norm": 1.0688848495483398, + "learning_rate": 0.0002, + "loss": 1.2122, + "step": 6170 + }, + { + "epoch": 5.175879396984925, + "grad_norm": 1.137519359588623, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 6180 + }, + { + "epoch": 5.184254606365159, + "grad_norm": 1.0728670358657837, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6190 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 1.2384949922561646, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 6200 + }, + { + "epoch": 5.201005025125628, + "grad_norm": 0.8391274809837341, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6210 + }, + { + "epoch": 5.209380234505863, + "grad_norm": 0.8948764801025391, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 6220 + }, + { + "epoch": 5.217755443886097, + "grad_norm": 0.9568309783935547, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6230 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 1.0604485273361206, + "learning_rate": 0.0002, + "loss": 1.2761, + "step": 6240 + }, + { + "epoch": 5.234505862646566, + "grad_norm": 1.1278935670852661, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6250 + }, + { + "epoch": 5.242881072026801, + "grad_norm": 0.9903607368469238, + "learning_rate": 0.0002, + "loss": 1.2332, + "step": 6260 + }, + { + "epoch": 5.251256281407035, + "grad_norm": 0.958718478679657, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6270 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.127510905265808, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 6280 + }, + { + "epoch": 5.268006700167504, + "grad_norm": 1.1683127880096436, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6290 + }, + { + "epoch": 5.276381909547739, + "grad_norm": 1.0723326206207275, + "learning_rate": 0.0002, + "loss": 1.2959, + "step": 6300 + }, + { + "epoch": 5.284757118927973, + "grad_norm": 0.9285374283790588, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 6310 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 0.9201741218566895, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6320 + }, + { + "epoch": 5.301507537688442, + "grad_norm": 0.9606702923774719, + "learning_rate": 0.0002, + "loss": 1.1816, + "step": 6330 + }, + { + "epoch": 5.309882747068677, + "grad_norm": 1.107960820198059, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 6340 + }, + { + "epoch": 5.318257956448911, + "grad_norm": 0.9342933297157288, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 6350 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 0.9170576930046082, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6360 + }, + { + "epoch": 5.33500837520938, + "grad_norm": 0.7612091898918152, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6370 + }, + { + "epoch": 5.343383584589615, + "grad_norm": 1.2524093389511108, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6380 + }, + { + "epoch": 5.351758793969849, + "grad_norm": 0.8481650352478027, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6390 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.0562204122543335, + "learning_rate": 0.0002, + "loss": 1.237, + "step": 6400 + }, + { + "epoch": 5.368509212730318, + "grad_norm": 0.96522456407547, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 6410 + }, + { + "epoch": 5.376884422110553, + "grad_norm": 0.9680143594741821, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6420 + }, + { + "epoch": 5.385259631490787, + "grad_norm": 0.9743781685829163, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 6430 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.8907374143600464, + "learning_rate": 0.0002, + "loss": 1.2637, + "step": 6440 + }, + { + "epoch": 5.402010050251256, + "grad_norm": 1.3755217790603638, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 6450 + }, + { + "epoch": 5.410385259631491, + "grad_norm": 1.1926233768463135, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6460 + }, + { + "epoch": 5.418760469011725, + "grad_norm": 0.8343448638916016, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 6470 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0056027173995972, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6480 + }, + { + "epoch": 5.435510887772194, + "grad_norm": 0.9482131600379944, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6490 + }, + { + "epoch": 5.443886097152429, + "grad_norm": 0.9766585826873779, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6500 + }, + { + "epoch": 5.452261306532663, + "grad_norm": 0.9226584434509277, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 6510 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9605025053024292, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6520 + }, + { + "epoch": 5.469011725293132, + "grad_norm": 1.0022773742675781, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 6530 + }, + { + "epoch": 5.477386934673367, + "grad_norm": 1.056764841079712, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 6540 + }, + { + "epoch": 5.485762144053601, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6550 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.8987206816673279, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 6560 + }, + { + "epoch": 5.50251256281407, + "grad_norm": 1.1946845054626465, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 6570 + }, + { + "epoch": 5.510887772194305, + "grad_norm": 1.037416696548462, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6580 + }, + { + "epoch": 5.519262981574539, + "grad_norm": 1.085598349571228, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6590 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9253745079040527, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6600 + }, + { + "epoch": 5.536013400335008, + "grad_norm": 1.0624418258666992, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6610 + }, + { + "epoch": 5.544388609715243, + "grad_norm": 1.002821922302246, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6620 + }, + { + "epoch": 5.552763819095477, + "grad_norm": 0.9343662858009338, + "learning_rate": 0.0002, + "loss": 1.2662, + "step": 6630 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.9129965305328369, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6640 + }, + { + "epoch": 5.569514237855946, + "grad_norm": 1.220263957977295, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 6650 + }, + { + "epoch": 5.577889447236181, + "grad_norm": 0.9705421924591064, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6660 + }, + { + "epoch": 5.586264656616415, + "grad_norm": 0.8417587876319885, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 6670 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9351304769515991, + "learning_rate": 0.0002, + "loss": 1.3616, + "step": 6680 + }, + { + "epoch": 5.603015075376884, + "grad_norm": 1.012598991394043, + "learning_rate": 0.0002, + "loss": 1.2795, + "step": 6690 + }, + { + "epoch": 5.611390284757119, + "grad_norm": 1.018328309059143, + "learning_rate": 0.0002, + "loss": 1.2457, + "step": 6700 + }, + { + "epoch": 5.619765494137353, + "grad_norm": 0.9289278388023376, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6710 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.8390841484069824, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 6720 + }, + { + "epoch": 5.636515912897822, + "grad_norm": 0.9989390969276428, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 6730 + }, + { + "epoch": 5.644891122278057, + "grad_norm": 1.0675761699676514, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 6740 + }, + { + "epoch": 5.653266331658291, + "grad_norm": 1.0649791955947876, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 6750 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 0.8542222380638123, + "learning_rate": 0.0002, + "loss": 1.2191, + "step": 6760 + }, + { + "epoch": 5.67001675041876, + "grad_norm": 0.9148173928260803, + "learning_rate": 0.0002, + "loss": 1.2336, + "step": 6770 + }, + { + "epoch": 5.678391959798995, + "grad_norm": 0.978024423122406, + "learning_rate": 0.0002, + "loss": 1.3286, + "step": 6780 + }, + { + "epoch": 5.686767169179229, + "grad_norm": 1.0385138988494873, + "learning_rate": 0.0002, + "loss": 1.2821, + "step": 6790 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9687889218330383, + "learning_rate": 0.0002, + "loss": 1.218, + "step": 6800 + }, + { + "epoch": 5.703517587939698, + "grad_norm": 0.862335205078125, + "learning_rate": 0.0002, + "loss": 1.3256, + "step": 6810 + }, + { + "epoch": 5.711892797319933, + "grad_norm": 0.9729578495025635, + "learning_rate": 0.0002, + "loss": 1.2783, + "step": 6820 + }, + { + "epoch": 5.720268006700167, + "grad_norm": 0.8936806321144104, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6830 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9222455620765686, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 6840 + }, + { + "epoch": 5.7370184254606365, + "grad_norm": 1.0584437847137451, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6850 + }, + { + "epoch": 5.745393634840871, + "grad_norm": 0.9114518165588379, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 6860 + }, + { + "epoch": 5.7537688442211055, + "grad_norm": 0.9590078592300415, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6870 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9056822061538696, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 6880 + }, + { + "epoch": 5.7705192629815745, + "grad_norm": 1.0069063901901245, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6890 + }, + { + "epoch": 5.778894472361809, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 1.3382, + "step": 6900 + }, + { + "epoch": 5.7872696817420435, + "grad_norm": 0.881629228591919, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 6910 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 1.1020095348358154, + "learning_rate": 0.0002, + "loss": 1.3122, + "step": 6920 + }, + { + "epoch": 5.8040201005025125, + "grad_norm": 0.8774619102478027, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6930 + }, + { + "epoch": 5.812395309882747, + "grad_norm": 0.9321739673614502, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6940 + }, + { + "epoch": 5.8207705192629815, + "grad_norm": 0.9082857966423035, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 6950 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9119554758071899, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6960 + }, + { + "epoch": 5.8375209380234505, + "grad_norm": 1.0643284320831299, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 6970 + }, + { + "epoch": 5.8458961474036855, + "grad_norm": 0.8526089787483215, + "learning_rate": 0.0002, + "loss": 1.3319, + "step": 6980 + }, + { + "epoch": 5.8542713567839195, + "grad_norm": 0.930439829826355, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6990 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.0461677312850952, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 7000 + }, + { + "epoch": 5.8710217755443885, + "grad_norm": 0.92561936378479, + "learning_rate": 0.0002, + "loss": 1.2623, + "step": 7010 + }, + { + "epoch": 5.8793969849246235, + "grad_norm": 0.8936395049095154, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 7020 + }, + { + "epoch": 5.8877721943048575, + "grad_norm": 0.986539363861084, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 7030 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.8776476383209229, + "learning_rate": 0.0002, + "loss": 1.2399, + "step": 7040 + }, + { + "epoch": 5.9045226130653266, + "grad_norm": 1.0256905555725098, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 7050 + }, + { + "epoch": 5.9128978224455615, + "grad_norm": 0.96241295337677, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 7060 + }, + { + "epoch": 5.921273031825796, + "grad_norm": 1.0251280069351196, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 7070 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.0794076919555664, + "learning_rate": 0.0002, + "loss": 1.2225, + "step": 7080 + }, + { + "epoch": 5.938023450586265, + "grad_norm": 0.9852448105812073, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 7090 + }, + { + "epoch": 5.9463986599664995, + "grad_norm": 1.1678671836853027, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 7100 + }, + { + "epoch": 5.954773869346734, + "grad_norm": 0.9818310141563416, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 7110 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 1.0732046365737915, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 7120 + }, + { + "epoch": 5.971524288107203, + "grad_norm": 0.912470281124115, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 7130 + }, + { + "epoch": 5.9798994974874375, + "grad_norm": 1.0944788455963135, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 7140 + }, + { + "epoch": 5.988274706867672, + "grad_norm": 1.0393965244293213, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 7150 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.8758739233016968, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 7160 + }, + { + "epoch": 6.0, + "eval_loss": 2.0526134967803955, + "eval_runtime": 37.9699, + "eval_samples_per_second": 13.563, + "eval_steps_per_second": 1.712, + "step": 7164 + }, + { + "epoch": 6.005025125628141, + "grad_norm": 1.138184666633606, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 7170 + }, + { + "epoch": 6.013400335008376, + "grad_norm": 0.9295315742492676, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 7180 + }, + { + "epoch": 6.02177554438861, + "grad_norm": 1.1252633333206177, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7190 + }, + { + "epoch": 6.030150753768845, + "grad_norm": 1.0611635446548462, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 7200 + }, + { + "epoch": 6.038525963149079, + "grad_norm": 1.022278070449829, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 7210 + }, + { + "epoch": 6.046901172529314, + "grad_norm": 1.0280728340148926, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 7220 + }, + { + "epoch": 6.055276381909548, + "grad_norm": 0.9516313076019287, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 7230 + }, + { + "epoch": 6.063651591289783, + "grad_norm": 1.0925321578979492, + "learning_rate": 0.0002, + "loss": 1.0388, + "step": 7240 + }, + { + "epoch": 6.072026800670017, + "grad_norm": 0.9885565042495728, + "learning_rate": 0.0002, + "loss": 1.113, + "step": 7250 + }, + { + "epoch": 6.080402010050252, + "grad_norm": 1.0905766487121582, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 7260 + }, + { + "epoch": 6.088777219430486, + "grad_norm": 1.075183391571045, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 7270 + }, + { + "epoch": 6.097152428810721, + "grad_norm": 1.0897727012634277, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 7280 + }, + { + "epoch": 6.105527638190955, + "grad_norm": 1.3677806854248047, + "learning_rate": 0.0002, + "loss": 1.0335, + "step": 7290 + }, + { + "epoch": 6.11390284757119, + "grad_norm": 1.1880329847335815, + "learning_rate": 0.0002, + "loss": 1.0566, + "step": 7300 + }, + { + "epoch": 6.122278056951424, + "grad_norm": 1.036330223083496, + "learning_rate": 0.0002, + "loss": 1.061, + "step": 7310 + }, + { + "epoch": 6.130653266331659, + "grad_norm": 1.2165348529815674, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 7320 + }, + { + "epoch": 6.139028475711893, + "grad_norm": 1.027368187904358, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 7330 + }, + { + "epoch": 6.147403685092128, + "grad_norm": 1.2497830390930176, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 7340 + }, + { + "epoch": 6.155778894472362, + "grad_norm": 1.166595458984375, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 7350 + }, + { + "epoch": 6.164154103852597, + "grad_norm": 1.1143730878829956, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 7360 + }, + { + "epoch": 6.172529313232831, + "grad_norm": 1.1531223058700562, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 7370 + }, + { + "epoch": 6.180904522613066, + "grad_norm": 1.176507830619812, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7380 + }, + { + "epoch": 6.1892797319933, + "grad_norm": 1.3174604177474976, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 7390 + }, + { + "epoch": 6.197654941373535, + "grad_norm": 1.0284459590911865, + "learning_rate": 0.0002, + "loss": 1.1586, + "step": 7400 + }, + { + "epoch": 6.206030150753769, + "grad_norm": 1.0801599025726318, + "learning_rate": 0.0002, + "loss": 1.1044, + "step": 7410 + }, + { + "epoch": 6.214405360134004, + "grad_norm": 1.200514554977417, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 7420 + }, + { + "epoch": 6.222780569514238, + "grad_norm": 1.0148060321807861, + "learning_rate": 0.0002, + "loss": 1.0234, + "step": 7430 + }, + { + "epoch": 6.231155778894473, + "grad_norm": 1.2368836402893066, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 7440 + }, + { + "epoch": 6.239530988274707, + "grad_norm": 1.228834629058838, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 7450 + }, + { + "epoch": 6.247906197654942, + "grad_norm": 1.1588891744613647, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 7460 + }, + { + "epoch": 6.256281407035176, + "grad_norm": 1.3500380516052246, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 7470 + }, + { + "epoch": 6.264656616415411, + "grad_norm": 1.1429533958435059, + "learning_rate": 0.0002, + "loss": 1.1057, + "step": 7480 + }, + { + "epoch": 6.273031825795645, + "grad_norm": 1.2314441204071045, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 7490 + }, + { + "epoch": 6.28140703517588, + "grad_norm": 1.0917996168136597, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7500 + }, + { + "epoch": 6.289782244556114, + "grad_norm": 1.3294450044631958, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 7510 + }, + { + "epoch": 6.298157453936349, + "grad_norm": 1.1035195589065552, + "learning_rate": 0.0002, + "loss": 1.1187, + "step": 7520 + }, + { + "epoch": 6.306532663316583, + "grad_norm": 1.2643269300460815, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 7530 + }, + { + "epoch": 6.314907872696818, + "grad_norm": 1.2226417064666748, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7540 + }, + { + "epoch": 6.323283082077052, + "grad_norm": 1.0248615741729736, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 7550 + }, + { + "epoch": 6.331658291457287, + "grad_norm": 1.28317129611969, + "learning_rate": 0.0002, + "loss": 1.0856, + "step": 7560 + }, + { + "epoch": 6.340033500837521, + "grad_norm": 1.1461660861968994, + "learning_rate": 0.0002, + "loss": 1.166, + "step": 7570 + }, + { + "epoch": 6.348408710217756, + "grad_norm": 1.297136664390564, + "learning_rate": 0.0002, + "loss": 1.1627, + "step": 7580 + }, + { + "epoch": 6.35678391959799, + "grad_norm": 1.3376781940460205, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 7590 + }, + { + "epoch": 6.365159128978225, + "grad_norm": 1.2507376670837402, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 7600 + }, + { + "epoch": 6.373534338358459, + "grad_norm": 1.3255126476287842, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 7610 + }, + { + "epoch": 6.381909547738694, + "grad_norm": 1.1082066297531128, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 7620 + }, + { + "epoch": 6.390284757118928, + "grad_norm": 1.4461497068405151, + "learning_rate": 0.0002, + "loss": 1.0894, + "step": 7630 + }, + { + "epoch": 6.398659966499163, + "grad_norm": 1.2875033617019653, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 7640 + }, + { + "epoch": 6.407035175879397, + "grad_norm": 1.1017295122146606, + "learning_rate": 0.0002, + "loss": 1.1027, + "step": 7650 + }, + { + "epoch": 6.415410385259632, + "grad_norm": 1.1896536350250244, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 7660 + }, + { + "epoch": 6.423785594639866, + "grad_norm": 1.0939011573791504, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 7670 + }, + { + "epoch": 6.432160804020101, + "grad_norm": 1.2593132257461548, + "learning_rate": 0.0002, + "loss": 1.1338, + "step": 7680 + }, + { + "epoch": 6.440536013400335, + "grad_norm": 1.1151225566864014, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 7690 + }, + { + "epoch": 6.44891122278057, + "grad_norm": 1.0686280727386475, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 7700 + }, + { + "epoch": 6.457286432160804, + "grad_norm": 1.4008738994598389, + "learning_rate": 0.0002, + "loss": 1.1611, + "step": 7710 + }, + { + "epoch": 6.465661641541039, + "grad_norm": 1.1698687076568604, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 7720 + }, + { + "epoch": 6.474036850921273, + "grad_norm": 1.1306401491165161, + "learning_rate": 0.0002, + "loss": 1.1637, + "step": 7730 + }, + { + "epoch": 6.482412060301508, + "grad_norm": 1.2970236539840698, + "learning_rate": 0.0002, + "loss": 1.1534, + "step": 7740 + }, + { + "epoch": 6.490787269681742, + "grad_norm": 1.1515544652938843, + "learning_rate": 0.0002, + "loss": 1.1408, + "step": 7750 + }, + { + "epoch": 6.499162479061977, + "grad_norm": 1.13273024559021, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 7760 + }, + { + "epoch": 6.507537688442211, + "grad_norm": 1.1635724306106567, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 7770 + }, + { + "epoch": 6.515912897822446, + "grad_norm": 1.1620264053344727, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 7780 + }, + { + "epoch": 6.52428810720268, + "grad_norm": 1.159905195236206, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 7790 + }, + { + "epoch": 6.532663316582915, + "grad_norm": 1.2243341207504272, + "learning_rate": 0.0002, + "loss": 1.1252, + "step": 7800 + }, + { + "epoch": 6.541038525963149, + "grad_norm": 1.1034481525421143, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 7810 + }, + { + "epoch": 6.549413735343384, + "grad_norm": 1.1131408214569092, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 7820 + }, + { + "epoch": 6.557788944723618, + "grad_norm": 1.211260199546814, + "learning_rate": 0.0002, + "loss": 1.1053, + "step": 7830 + }, + { + "epoch": 6.566164154103853, + "grad_norm": 1.408692717552185, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 7840 + }, + { + "epoch": 6.574539363484087, + "grad_norm": 1.151441216468811, + "learning_rate": 0.0002, + "loss": 1.1586, + "step": 7850 + }, + { + "epoch": 6.582914572864322, + "grad_norm": 1.1160012483596802, + "learning_rate": 0.0002, + "loss": 1.1754, + "step": 7860 + }, + { + "epoch": 6.591289782244556, + "grad_norm": 1.2496052980422974, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 7870 + }, + { + "epoch": 6.599664991624791, + "grad_norm": 1.559907078742981, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 7880 + }, + { + "epoch": 6.608040201005025, + "grad_norm": 1.4399309158325195, + "learning_rate": 0.0002, + "loss": 1.1482, + "step": 7890 + }, + { + "epoch": 6.61641541038526, + "grad_norm": 1.155007243156433, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 7900 + }, + { + "epoch": 6.624790619765494, + "grad_norm": 1.4339076280593872, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 7910 + }, + { + "epoch": 6.633165829145729, + "grad_norm": 1.2093058824539185, + "learning_rate": 0.0002, + "loss": 1.1594, + "step": 7920 + }, + { + "epoch": 6.641541038525963, + "grad_norm": 1.1619434356689453, + "learning_rate": 0.0002, + "loss": 1.185, + "step": 7930 + }, + { + "epoch": 6.649916247906198, + "grad_norm": 1.2879594564437866, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 7940 + }, + { + "epoch": 6.658291457286432, + "grad_norm": 1.0598394870758057, + "learning_rate": 0.0002, + "loss": 1.1992, + "step": 7950 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0937503576278687, + "learning_rate": 0.0002, + "loss": 1.1337, + "step": 7960 + }, + { + "epoch": 6.675041876046901, + "grad_norm": 1.2670115232467651, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 7970 + }, + { + "epoch": 6.683417085427136, + "grad_norm": 1.2351782321929932, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 7980 + }, + { + "epoch": 6.69179229480737, + "grad_norm": 1.344128131866455, + "learning_rate": 0.0002, + "loss": 1.1774, + "step": 7990 + }, + { + "epoch": 6.700167504187605, + "grad_norm": 1.2894740104675293, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 8000 + }, + { + "epoch": 6.708542713567839, + "grad_norm": 1.1804684400558472, + "learning_rate": 0.0002, + "loss": 1.1045, + "step": 8010 + }, + { + "epoch": 6.716917922948074, + "grad_norm": 1.314237356185913, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 8020 + }, + { + "epoch": 6.725293132328308, + "grad_norm": 1.2132530212402344, + "learning_rate": 0.0002, + "loss": 1.1113, + "step": 8030 + }, + { + "epoch": 6.733668341708543, + "grad_norm": 0.999580979347229, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 8040 + }, + { + "epoch": 6.742043551088777, + "grad_norm": 1.206323266029358, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 8050 + }, + { + "epoch": 6.750418760469012, + "grad_norm": 1.1092344522476196, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 8060 + }, + { + "epoch": 6.758793969849246, + "grad_norm": 1.0168755054473877, + "learning_rate": 0.0002, + "loss": 1.1583, + "step": 8070 + }, + { + "epoch": 6.767169179229481, + "grad_norm": 1.2310614585876465, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 8080 + }, + { + "epoch": 6.775544388609715, + "grad_norm": 1.1587172746658325, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 8090 + }, + { + "epoch": 6.78391959798995, + "grad_norm": 1.1362504959106445, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 8100 + }, + { + "epoch": 6.792294807370184, + "grad_norm": 1.3735119104385376, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 8110 + }, + { + "epoch": 6.800670016750419, + "grad_norm": 1.1804813146591187, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 8120 + }, + { + "epoch": 6.809045226130653, + "grad_norm": 1.1849592924118042, + "learning_rate": 0.0002, + "loss": 1.1035, + "step": 8130 + }, + { + "epoch": 6.817420435510888, + "grad_norm": 1.1638602018356323, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 8140 + }, + { + "epoch": 6.825795644891122, + "grad_norm": 1.2106250524520874, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 8150 + }, + { + "epoch": 6.834170854271357, + "grad_norm": 1.276068091392517, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 8160 + }, + { + "epoch": 6.842546063651591, + "grad_norm": 1.4283488988876343, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 8170 + }, + { + "epoch": 6.850921273031826, + "grad_norm": 1.4286448955535889, + "learning_rate": 0.0002, + "loss": 1.1494, + "step": 8180 + }, + { + "epoch": 6.85929648241206, + "grad_norm": 1.191275715827942, + "learning_rate": 0.0002, + "loss": 1.185, + "step": 8190 + }, + { + "epoch": 6.867671691792295, + "grad_norm": 1.4232908487319946, + "learning_rate": 0.0002, + "loss": 1.1984, + "step": 8200 + }, + { + "epoch": 6.876046901172529, + "grad_norm": 1.2166317701339722, + "learning_rate": 0.0002, + "loss": 1.182, + "step": 8210 + }, + { + "epoch": 6.884422110552764, + "grad_norm": 1.0487027168273926, + "learning_rate": 0.0002, + "loss": 1.1311, + "step": 8220 + }, + { + "epoch": 6.892797319932998, + "grad_norm": 1.247178077697754, + "learning_rate": 0.0002, + "loss": 1.1973, + "step": 8230 + }, + { + "epoch": 6.901172529313233, + "grad_norm": 1.0728635787963867, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 8240 + }, + { + "epoch": 6.909547738693467, + "grad_norm": 1.1909451484680176, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 8250 + }, + { + "epoch": 6.917922948073702, + "grad_norm": 1.337556004524231, + "learning_rate": 0.0002, + "loss": 1.1336, + "step": 8260 + }, + { + "epoch": 6.926298157453936, + "grad_norm": 1.1479394435882568, + "learning_rate": 0.0002, + "loss": 1.2295, + "step": 8270 + }, + { + "epoch": 6.934673366834171, + "grad_norm": 1.2038872241973877, + "learning_rate": 0.0002, + "loss": 1.1497, + "step": 8280 + }, + { + "epoch": 6.943048576214405, + "grad_norm": 1.088813066482544, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 8290 + }, + { + "epoch": 6.95142378559464, + "grad_norm": 1.0153290033340454, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 8300 + }, + { + "epoch": 6.959798994974874, + "grad_norm": 1.2159703969955444, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 8310 + }, + { + "epoch": 6.968174204355109, + "grad_norm": 1.0844143629074097, + "learning_rate": 0.0002, + "loss": 1.1029, + "step": 8320 + }, + { + "epoch": 6.976549413735343, + "grad_norm": 1.1617385149002075, + "learning_rate": 0.0002, + "loss": 1.1843, + "step": 8330 + }, + { + "epoch": 6.984924623115578, + "grad_norm": 1.126503586769104, + "learning_rate": 0.0002, + "loss": 1.177, + "step": 8340 + }, + { + "epoch": 6.993299832495812, + "grad_norm": 1.1553548574447632, + "learning_rate": 0.0002, + "loss": 1.1753, + "step": 8350 + }, + { + "epoch": 7.0, + "eval_loss": 2.1463968753814697, + "eval_runtime": 37.9219, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 8358 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.867895256800297e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-8358/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2f0ce2c83666424375e684951a7e5159c1f8d707 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b49cd8fc7eadbf2ec77e0b4ab00602004f41a4a05e15b6eade6cc6c770e5faf +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..048c847d3742996c805b29ec4ad2f6a0d4fc0fdd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a63d12669a63f66e1d7ce22b0be701d40b6849b373c98b8b2659589f8aad12ca +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..34da23d84f04b31074026405be15b20ee229fb1a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ba0ae6252fbab20ee649715f1cbbc89e378d2bfaf095aa1da3f781ff54e3bd +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..653f9514c32e69a24084c7271ed1f92e75f34b0f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d19d45187183d51ea5b60dd806b17aaa200ec74942b47b4bed9544698502d5 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..172b9f2b6f998cc4d2def340ddace9b5bc64fb63 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/trainer_state.json @@ -0,0 +1,6782 @@ +{ + "best_metric": 1.8061236143112183, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 9552, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008375209380234505, + "grad_norm": 0.6290814280509949, + "learning_rate": 0.0002, + "loss": 2.6252, + "step": 10 + }, + { + "epoch": 0.01675041876046901, + "grad_norm": 0.5023976564407349, + "learning_rate": 0.0002, + "loss": 2.3237, + "step": 20 + }, + { + "epoch": 0.02512562814070352, + "grad_norm": 0.5448721647262573, + "learning_rate": 0.0002, + "loss": 2.1575, + "step": 30 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 0.4906269609928131, + "learning_rate": 0.0002, + "loss": 1.967, + "step": 40 + }, + { + "epoch": 0.04187604690117253, + "grad_norm": 0.49321722984313965, + "learning_rate": 0.0002, + "loss": 1.9464, + "step": 50 + }, + { + "epoch": 0.05025125628140704, + "grad_norm": 0.4470495581626892, + "learning_rate": 0.0002, + "loss": 1.9645, + "step": 60 + }, + { + "epoch": 0.05862646566164154, + "grad_norm": 0.49971723556518555, + "learning_rate": 0.0002, + "loss": 1.8989, + "step": 70 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.4249754548072815, + "learning_rate": 0.0002, + "loss": 1.8629, + "step": 80 + }, + { + "epoch": 0.07537688442211055, + "grad_norm": 0.43136730790138245, + "learning_rate": 0.0002, + "loss": 1.9229, + "step": 90 + }, + { + "epoch": 0.08375209380234507, + "grad_norm": 0.5939809679985046, + "learning_rate": 0.0002, + "loss": 1.8768, + "step": 100 + }, + { + "epoch": 0.09212730318257957, + "grad_norm": 0.4249511659145355, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 110 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.451865017414093, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 120 + }, + { + "epoch": 0.10887772194304858, + "grad_norm": 0.42394405603408813, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 130 + }, + { + "epoch": 0.11725293132328309, + "grad_norm": 0.3683006763458252, + "learning_rate": 0.0002, + "loss": 1.8411, + "step": 140 + }, + { + "epoch": 0.12562814070351758, + "grad_norm": 0.411150723695755, + "learning_rate": 0.0002, + "loss": 1.8605, + "step": 150 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4213576018810272, + "learning_rate": 0.0002, + "loss": 1.7842, + "step": 160 + }, + { + "epoch": 0.1423785594639866, + "grad_norm": 0.4385589361190796, + "learning_rate": 0.0002, + "loss": 1.8892, + "step": 170 + }, + { + "epoch": 0.1507537688442211, + "grad_norm": 0.4446942210197449, + "learning_rate": 0.0002, + "loss": 1.8369, + "step": 180 + }, + { + "epoch": 0.15912897822445563, + "grad_norm": 0.4562969207763672, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 190 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 0.49195992946624756, + "learning_rate": 0.0002, + "loss": 1.8848, + "step": 200 + }, + { + "epoch": 0.17587939698492464, + "grad_norm": 0.3948725461959839, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 210 + }, + { + "epoch": 0.18425460636515914, + "grad_norm": 0.37087398767471313, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 220 + }, + { + "epoch": 0.19262981574539365, + "grad_norm": 0.3847447633743286, + "learning_rate": 0.0002, + "loss": 1.8392, + "step": 230 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.3973361849784851, + "learning_rate": 0.0002, + "loss": 1.7498, + "step": 240 + }, + { + "epoch": 0.20938023450586266, + "grad_norm": 0.3675636947154999, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 250 + }, + { + "epoch": 0.21775544388609716, + "grad_norm": 0.38187175989151, + "learning_rate": 0.0002, + "loss": 1.8318, + "step": 260 + }, + { + "epoch": 0.22613065326633167, + "grad_norm": 0.36000028252601624, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 270 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 0.3819858729839325, + "learning_rate": 0.0002, + "loss": 1.8129, + "step": 280 + }, + { + "epoch": 0.24288107202680068, + "grad_norm": 0.36370471119880676, + "learning_rate": 0.0002, + "loss": 1.7971, + "step": 290 + }, + { + "epoch": 0.25125628140703515, + "grad_norm": 0.3492966294288635, + "learning_rate": 0.0002, + "loss": 1.8518, + "step": 300 + }, + { + "epoch": 0.25963149078726966, + "grad_norm": 0.32806646823883057, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 310 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.3824801743030548, + "learning_rate": 0.0002, + "loss": 1.8338, + "step": 320 + }, + { + "epoch": 0.27638190954773867, + "grad_norm": 0.48781588673591614, + "learning_rate": 0.0002, + "loss": 1.8702, + "step": 330 + }, + { + "epoch": 0.2847571189279732, + "grad_norm": 0.416357159614563, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 340 + }, + { + "epoch": 0.2931323283082077, + "grad_norm": 0.34518781304359436, + "learning_rate": 0.0002, + "loss": 1.8543, + "step": 350 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.3333123028278351, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 360 + }, + { + "epoch": 0.3098827470686767, + "grad_norm": 0.4125552475452423, + "learning_rate": 0.0002, + "loss": 1.7434, + "step": 370 + }, + { + "epoch": 0.31825795644891125, + "grad_norm": 0.40044137835502625, + "learning_rate": 0.0002, + "loss": 1.8679, + "step": 380 + }, + { + "epoch": 0.32663316582914576, + "grad_norm": 0.44981154799461365, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 390 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.6972532868385315, + "learning_rate": 0.0002, + "loss": 1.7907, + "step": 400 + }, + { + "epoch": 0.34338358458961477, + "grad_norm": 0.3069273829460144, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 410 + }, + { + "epoch": 0.35175879396984927, + "grad_norm": 0.35586047172546387, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 420 + }, + { + "epoch": 0.3601340033500838, + "grad_norm": 0.40816494822502136, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 430 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 0.3377438187599182, + "learning_rate": 0.0002, + "loss": 1.8004, + "step": 440 + }, + { + "epoch": 0.3768844221105528, + "grad_norm": 0.31523144245147705, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.3852596314907873, + "grad_norm": 0.3472132682800293, + "learning_rate": 0.0002, + "loss": 1.771, + "step": 460 + }, + { + "epoch": 0.3936348408710218, + "grad_norm": 0.3513853847980499, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 470 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.366720587015152, + "learning_rate": 0.0002, + "loss": 1.7818, + "step": 480 + }, + { + "epoch": 0.4103852596314908, + "grad_norm": 0.48535996675491333, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 490 + }, + { + "epoch": 0.4187604690117253, + "grad_norm": 0.378305584192276, + "learning_rate": 0.0002, + "loss": 1.8674, + "step": 500 + }, + { + "epoch": 0.4271356783919598, + "grad_norm": 0.31175753474235535, + "learning_rate": 0.0002, + "loss": 1.8145, + "step": 510 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 0.3505520820617676, + "learning_rate": 0.0002, + "loss": 1.7745, + "step": 520 + }, + { + "epoch": 0.4438860971524288, + "grad_norm": 0.3446848690509796, + "learning_rate": 0.0002, + "loss": 1.8194, + "step": 530 + }, + { + "epoch": 0.45226130653266333, + "grad_norm": 0.3255297541618347, + "learning_rate": 0.0002, + "loss": 1.7787, + "step": 540 + }, + { + "epoch": 0.46063651591289784, + "grad_norm": 0.3216710686683655, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 550 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.3307957649230957, + "learning_rate": 0.0002, + "loss": 1.7919, + "step": 560 + }, + { + "epoch": 0.47738693467336685, + "grad_norm": 0.3295125663280487, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 570 + }, + { + "epoch": 0.48576214405360135, + "grad_norm": 0.349960595369339, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 580 + }, + { + "epoch": 0.49413735343383586, + "grad_norm": 0.32447564601898193, + "learning_rate": 0.0002, + "loss": 1.8474, + "step": 590 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.3343949615955353, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.5108877721943048, + "grad_norm": 0.3556120991706848, + "learning_rate": 0.0002, + "loss": 1.7856, + "step": 610 + }, + { + "epoch": 0.5192629815745393, + "grad_norm": 0.38598525524139404, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 620 + }, + { + "epoch": 0.5276381909547738, + "grad_norm": 0.3493153154850006, + "learning_rate": 0.0002, + "loss": 1.7857, + "step": 630 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.35715600848197937, + "learning_rate": 0.0002, + "loss": 1.7699, + "step": 640 + }, + { + "epoch": 0.5443886097152428, + "grad_norm": 0.3686097264289856, + "learning_rate": 0.0002, + "loss": 1.8295, + "step": 650 + }, + { + "epoch": 0.5527638190954773, + "grad_norm": 0.32571321725845337, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 660 + }, + { + "epoch": 0.5611390284757118, + "grad_norm": 0.33986029028892517, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 670 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 0.33575883507728577, + "learning_rate": 0.0002, + "loss": 1.7874, + "step": 680 + }, + { + "epoch": 0.5778894472361809, + "grad_norm": 0.30621081590652466, + "learning_rate": 0.0002, + "loss": 1.8046, + "step": 690 + }, + { + "epoch": 0.5862646566164154, + "grad_norm": 0.30717912316322327, + "learning_rate": 0.0002, + "loss": 1.797, + "step": 700 + }, + { + "epoch": 0.5946398659966499, + "grad_norm": 0.33896031975746155, + "learning_rate": 0.0002, + "loss": 1.7696, + "step": 710 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.35164183378219604, + "learning_rate": 0.0002, + "loss": 1.8045, + "step": 720 + }, + { + "epoch": 0.6113902847571189, + "grad_norm": 0.47714051604270935, + "learning_rate": 0.0002, + "loss": 1.8606, + "step": 730 + }, + { + "epoch": 0.6197654941373534, + "grad_norm": 0.34266430139541626, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 740 + }, + { + "epoch": 0.628140703517588, + "grad_norm": 0.354221910238266, + "learning_rate": 0.0002, + "loss": 1.756, + "step": 750 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 0.3694717586040497, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 760 + }, + { + "epoch": 0.644891122278057, + "grad_norm": 0.35219788551330566, + "learning_rate": 0.0002, + "loss": 1.7441, + "step": 770 + }, + { + "epoch": 0.6532663316582915, + "grad_norm": 0.31869757175445557, + "learning_rate": 0.0002, + "loss": 1.8616, + "step": 780 + }, + { + "epoch": 0.661641541038526, + "grad_norm": 0.3729475736618042, + "learning_rate": 0.0002, + "loss": 1.7981, + "step": 790 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 0.3431633710861206, + "learning_rate": 0.0002, + "loss": 1.8384, + "step": 800 + }, + { + "epoch": 0.678391959798995, + "grad_norm": 0.3452960252761841, + "learning_rate": 0.0002, + "loss": 1.7431, + "step": 810 + }, + { + "epoch": 0.6867671691792295, + "grad_norm": 0.31068870425224304, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 820 + }, + { + "epoch": 0.695142378559464, + "grad_norm": 0.3213907778263092, + "learning_rate": 0.0002, + "loss": 1.8275, + "step": 830 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.2922039330005646, + "learning_rate": 0.0002, + "loss": 1.7975, + "step": 840 + }, + { + "epoch": 0.711892797319933, + "grad_norm": 0.36271268129348755, + "learning_rate": 0.0002, + "loss": 1.817, + "step": 850 + }, + { + "epoch": 0.7202680067001676, + "grad_norm": 0.3195357918739319, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 860 + }, + { + "epoch": 0.7286432160804021, + "grad_norm": 0.31721433997154236, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 870 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.32121971249580383, + "learning_rate": 0.0002, + "loss": 1.832, + "step": 880 + }, + { + "epoch": 0.7453936348408711, + "grad_norm": 0.3149084150791168, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 890 + }, + { + "epoch": 0.7537688442211056, + "grad_norm": 0.38880932331085205, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 900 + }, + { + "epoch": 0.7621440536013401, + "grad_norm": 0.31491366028785706, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 910 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 0.2900884449481964, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 920 + }, + { + "epoch": 0.7788944723618091, + "grad_norm": 0.31911659240722656, + "learning_rate": 0.0002, + "loss": 1.7352, + "step": 930 + }, + { + "epoch": 0.7872696817420436, + "grad_norm": 0.33131274580955505, + "learning_rate": 0.0002, + "loss": 1.8334, + "step": 940 + }, + { + "epoch": 0.7956448911222781, + "grad_norm": 0.2980491816997528, + "learning_rate": 0.0002, + "loss": 1.8077, + "step": 950 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.3282995820045471, + "learning_rate": 0.0002, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.8123953098827471, + "grad_norm": 0.3234929144382477, + "learning_rate": 0.0002, + "loss": 1.7695, + "step": 970 + }, + { + "epoch": 0.8207705192629816, + "grad_norm": 0.31825992465019226, + "learning_rate": 0.0002, + "loss": 1.8491, + "step": 980 + }, + { + "epoch": 0.8291457286432161, + "grad_norm": 0.32733580470085144, + "learning_rate": 0.0002, + "loss": 1.8002, + "step": 990 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 0.3082098066806793, + "learning_rate": 0.0002, + "loss": 1.8407, + "step": 1000 + }, + { + "epoch": 0.8458961474036851, + "grad_norm": 0.32492074370384216, + "learning_rate": 0.0002, + "loss": 1.7784, + "step": 1010 + }, + { + "epoch": 0.8542713567839196, + "grad_norm": 0.3304888904094696, + "learning_rate": 0.0002, + "loss": 1.839, + "step": 1020 + }, + { + "epoch": 0.8626465661641541, + "grad_norm": 0.3304980397224426, + "learning_rate": 0.0002, + "loss": 1.808, + "step": 1030 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.3537079989910126, + "learning_rate": 0.0002, + "loss": 1.8345, + "step": 1040 + }, + { + "epoch": 0.8793969849246231, + "grad_norm": 0.34958404302597046, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 1050 + }, + { + "epoch": 0.8877721943048577, + "grad_norm": 0.34610459208488464, + "learning_rate": 0.0002, + "loss": 1.8036, + "step": 1060 + }, + { + "epoch": 0.8961474036850922, + "grad_norm": 0.35725486278533936, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1070 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.30205485224723816, + "learning_rate": 0.0002, + "loss": 1.7997, + "step": 1080 + }, + { + "epoch": 0.9128978224455612, + "grad_norm": 0.3658352196216583, + "learning_rate": 0.0002, + "loss": 1.7749, + "step": 1090 + }, + { + "epoch": 0.9212730318257957, + "grad_norm": 0.33731144666671753, + "learning_rate": 0.0002, + "loss": 1.7844, + "step": 1100 + }, + { + "epoch": 0.9296482412060302, + "grad_norm": 0.35221847891807556, + "learning_rate": 0.0002, + "loss": 1.8047, + "step": 1110 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.3193749487400055, + "learning_rate": 0.0002, + "loss": 1.7892, + "step": 1120 + }, + { + "epoch": 0.9463986599664992, + "grad_norm": 0.29893460869789124, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1130 + }, + { + "epoch": 0.9547738693467337, + "grad_norm": 0.37168779969215393, + "learning_rate": 0.0002, + "loss": 1.8226, + "step": 1140 + }, + { + "epoch": 0.9631490787269682, + "grad_norm": 0.3465111255645752, + "learning_rate": 0.0002, + "loss": 1.7994, + "step": 1150 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 0.33802181482315063, + "learning_rate": 0.0002, + "loss": 1.8583, + "step": 1160 + }, + { + "epoch": 0.9798994974874372, + "grad_norm": 0.36273202300071716, + "learning_rate": 0.0002, + "loss": 1.8652, + "step": 1170 + }, + { + "epoch": 0.9882747068676717, + "grad_norm": 0.33043375611305237, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 1180 + }, + { + "epoch": 0.9966499162479062, + "grad_norm": 0.3027370870113373, + "learning_rate": 0.0002, + "loss": 1.729, + "step": 1190 + }, + { + "epoch": 1.0, + "eval_loss": 1.8088148832321167, + "eval_runtime": 37.9609, + "eval_samples_per_second": 13.567, + "eval_steps_per_second": 1.712, + "step": 1194 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 0.4256260097026825, + "learning_rate": 0.0002, + "loss": 1.7492, + "step": 1200 + }, + { + "epoch": 1.0134003350083751, + "grad_norm": 0.35050156712532043, + "learning_rate": 0.0002, + "loss": 1.6994, + "step": 1210 + }, + { + "epoch": 1.0217755443886096, + "grad_norm": 0.34773948788642883, + "learning_rate": 0.0002, + "loss": 1.7422, + "step": 1220 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.35487470030784607, + "learning_rate": 0.0002, + "loss": 1.7803, + "step": 1230 + }, + { + "epoch": 1.0385259631490786, + "grad_norm": 0.37040361762046814, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1240 + }, + { + "epoch": 1.0469011725293131, + "grad_norm": 0.33740508556365967, + "learning_rate": 0.0002, + "loss": 1.7663, + "step": 1250 + }, + { + "epoch": 1.0552763819095476, + "grad_norm": 0.3962724506855011, + "learning_rate": 0.0002, + "loss": 1.7485, + "step": 1260 + }, + { + "epoch": 1.0636515912897822, + "grad_norm": 0.3129824101924896, + "learning_rate": 0.0002, + "loss": 1.7334, + "step": 1270 + }, + { + "epoch": 1.0720268006700167, + "grad_norm": 0.3620055019855499, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1280 + }, + { + "epoch": 1.0804020100502512, + "grad_norm": 0.3480982184410095, + "learning_rate": 0.0002, + "loss": 1.7823, + "step": 1290 + }, + { + "epoch": 1.0887772194304857, + "grad_norm": 0.344424843788147, + "learning_rate": 0.0002, + "loss": 1.7081, + "step": 1300 + }, + { + "epoch": 1.0971524288107202, + "grad_norm": 0.3480122685432434, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1310 + }, + { + "epoch": 1.1055276381909547, + "grad_norm": 0.323662132024765, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1320 + }, + { + "epoch": 1.1139028475711892, + "grad_norm": 0.35440102219581604, + "learning_rate": 0.0002, + "loss": 1.7517, + "step": 1330 + }, + { + "epoch": 1.1222780569514237, + "grad_norm": 0.3342263698577881, + "learning_rate": 0.0002, + "loss": 1.7573, + "step": 1340 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.35705259442329407, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 1350 + }, + { + "epoch": 1.1390284757118927, + "grad_norm": 0.38021907210350037, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1360 + }, + { + "epoch": 1.1474036850921272, + "grad_norm": 0.34918731451034546, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 1370 + }, + { + "epoch": 1.1557788944723617, + "grad_norm": 0.371868371963501, + "learning_rate": 0.0002, + "loss": 1.7628, + "step": 1380 + }, + { + "epoch": 1.1641541038525962, + "grad_norm": 0.38413912057876587, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1390 + }, + { + "epoch": 1.1725293132328307, + "grad_norm": 0.3898005187511444, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1400 + }, + { + "epoch": 1.1809045226130652, + "grad_norm": 0.3726498484611511, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 1410 + }, + { + "epoch": 1.1892797319932997, + "grad_norm": 0.3532905876636505, + "learning_rate": 0.0002, + "loss": 1.7379, + "step": 1420 + }, + { + "epoch": 1.1976549413735342, + "grad_norm": 0.338127464056015, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1430 + }, + { + "epoch": 1.2060301507537687, + "grad_norm": 0.3472749888896942, + "learning_rate": 0.0002, + "loss": 1.871, + "step": 1440 + }, + { + "epoch": 1.2144053601340032, + "grad_norm": 0.3523476719856262, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1450 + }, + { + "epoch": 1.2227805695142377, + "grad_norm": 0.42986124753952026, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 1460 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 0.38195517659187317, + "learning_rate": 0.0002, + "loss": 1.7459, + "step": 1470 + }, + { + "epoch": 1.2395309882747068, + "grad_norm": 0.31665122509002686, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 1480 + }, + { + "epoch": 1.2479061976549413, + "grad_norm": 0.3539541959762573, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 1490 + }, + { + "epoch": 1.2562814070351758, + "grad_norm": 0.40162816643714905, + "learning_rate": 0.0002, + "loss": 1.7655, + "step": 1500 + }, + { + "epoch": 1.2646566164154103, + "grad_norm": 0.34727150201797485, + "learning_rate": 0.0002, + "loss": 1.702, + "step": 1510 + }, + { + "epoch": 1.2730318257956448, + "grad_norm": 0.3364993929862976, + "learning_rate": 0.0002, + "loss": 1.7804, + "step": 1520 + }, + { + "epoch": 1.2814070351758793, + "grad_norm": 0.323483943939209, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 1530 + }, + { + "epoch": 1.2897822445561138, + "grad_norm": 0.4114733934402466, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 1540 + }, + { + "epoch": 1.2981574539363483, + "grad_norm": 0.37476620078086853, + "learning_rate": 0.0002, + "loss": 1.6525, + "step": 1550 + }, + { + "epoch": 1.3065326633165828, + "grad_norm": 0.4216269552707672, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1560 + }, + { + "epoch": 1.3149078726968173, + "grad_norm": 0.3204927444458008, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1570 + }, + { + "epoch": 1.3232830820770518, + "grad_norm": 0.36916354298591614, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 1580 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.3755691647529602, + "learning_rate": 0.0002, + "loss": 1.7383, + "step": 1590 + }, + { + "epoch": 1.3400335008375208, + "grad_norm": 0.3688889443874359, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 1600 + }, + { + "epoch": 1.3484087102177553, + "grad_norm": 0.34306398034095764, + "learning_rate": 0.0002, + "loss": 1.7664, + "step": 1610 + }, + { + "epoch": 1.3567839195979898, + "grad_norm": 0.3651525676250458, + "learning_rate": 0.0002, + "loss": 1.6943, + "step": 1620 + }, + { + "epoch": 1.3651591289782243, + "grad_norm": 0.3461526036262512, + "learning_rate": 0.0002, + "loss": 1.7206, + "step": 1630 + }, + { + "epoch": 1.3735343383584588, + "grad_norm": 0.37959185242652893, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 1640 + }, + { + "epoch": 1.3819095477386933, + "grad_norm": 0.4005356431007385, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 1650 + }, + { + "epoch": 1.3902847571189278, + "grad_norm": 0.3537434935569763, + "learning_rate": 0.0002, + "loss": 1.694, + "step": 1660 + }, + { + "epoch": 1.3986599664991624, + "grad_norm": 0.38220855593681335, + "learning_rate": 0.0002, + "loss": 1.6679, + "step": 1670 + }, + { + "epoch": 1.4070351758793969, + "grad_norm": 0.3573434352874756, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1680 + }, + { + "epoch": 1.4154103852596314, + "grad_norm": 0.40028059482574463, + "learning_rate": 0.0002, + "loss": 1.6983, + "step": 1690 + }, + { + "epoch": 1.4237855946398659, + "grad_norm": 0.3953610360622406, + "learning_rate": 0.0002, + "loss": 1.7049, + "step": 1700 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.39524543285369873, + "learning_rate": 0.0002, + "loss": 1.7126, + "step": 1710 + }, + { + "epoch": 1.4405360134003349, + "grad_norm": 0.37721359729766846, + "learning_rate": 0.0002, + "loss": 1.8319, + "step": 1720 + }, + { + "epoch": 1.4489112227805694, + "grad_norm": 0.4220093786716461, + "learning_rate": 0.0002, + "loss": 1.7387, + "step": 1730 + }, + { + "epoch": 1.457286432160804, + "grad_norm": 0.3876369595527649, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 1740 + }, + { + "epoch": 1.4656616415410384, + "grad_norm": 0.3774619400501251, + "learning_rate": 0.0002, + "loss": 1.6859, + "step": 1750 + }, + { + "epoch": 1.474036850921273, + "grad_norm": 0.3608052432537079, + "learning_rate": 0.0002, + "loss": 1.7223, + "step": 1760 + }, + { + "epoch": 1.4824120603015074, + "grad_norm": 0.32083916664123535, + "learning_rate": 0.0002, + "loss": 1.6746, + "step": 1770 + }, + { + "epoch": 1.490787269681742, + "grad_norm": 0.32290884852409363, + "learning_rate": 0.0002, + "loss": 1.716, + "step": 1780 + }, + { + "epoch": 1.4991624790619764, + "grad_norm": 0.3537974953651428, + "learning_rate": 0.0002, + "loss": 1.7648, + "step": 1790 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 0.36576104164123535, + "learning_rate": 0.0002, + "loss": 1.6784, + "step": 1800 + }, + { + "epoch": 1.5159128978224454, + "grad_norm": 0.3336752653121948, + "learning_rate": 0.0002, + "loss": 1.6818, + "step": 1810 + }, + { + "epoch": 1.52428810720268, + "grad_norm": 0.3551652431488037, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1820 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.43313586711883545, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 1830 + }, + { + "epoch": 1.541038525963149, + "grad_norm": 0.39160311222076416, + "learning_rate": 0.0002, + "loss": 1.7358, + "step": 1840 + }, + { + "epoch": 1.5494137353433834, + "grad_norm": 0.38758179545402527, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 1850 + }, + { + "epoch": 1.557788944723618, + "grad_norm": 0.3658832013607025, + "learning_rate": 0.0002, + "loss": 1.7768, + "step": 1860 + }, + { + "epoch": 1.5661641541038525, + "grad_norm": 0.375372052192688, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 1870 + }, + { + "epoch": 1.574539363484087, + "grad_norm": 0.3586942255496979, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1880 + }, + { + "epoch": 1.5829145728643215, + "grad_norm": 0.3626467287540436, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1890 + }, + { + "epoch": 1.591289782244556, + "grad_norm": 0.4199363589286804, + "learning_rate": 0.0002, + "loss": 1.7943, + "step": 1900 + }, + { + "epoch": 1.5996649916247905, + "grad_norm": 0.35646331310272217, + "learning_rate": 0.0002, + "loss": 1.6551, + "step": 1910 + }, + { + "epoch": 1.608040201005025, + "grad_norm": 0.3465106189250946, + "learning_rate": 0.0002, + "loss": 1.7125, + "step": 1920 + }, + { + "epoch": 1.6164154103852595, + "grad_norm": 0.43392884731292725, + "learning_rate": 0.0002, + "loss": 1.8507, + "step": 1930 + }, + { + "epoch": 1.624790619765494, + "grad_norm": 0.39187198877334595, + "learning_rate": 0.0002, + "loss": 1.7009, + "step": 1940 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.3685080409049988, + "learning_rate": 0.0002, + "loss": 1.7202, + "step": 1950 + }, + { + "epoch": 1.641541038525963, + "grad_norm": 0.4044491946697235, + "learning_rate": 0.0002, + "loss": 1.6607, + "step": 1960 + }, + { + "epoch": 1.6499162479061975, + "grad_norm": 0.4388049244880676, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 1970 + }, + { + "epoch": 1.658291457286432, + "grad_norm": 0.36165162920951843, + "learning_rate": 0.0002, + "loss": 1.7178, + "step": 1980 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3501148521900177, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 1990 + }, + { + "epoch": 1.675041876046901, + "grad_norm": 0.3751881718635559, + "learning_rate": 0.0002, + "loss": 1.7057, + "step": 2000 + }, + { + "epoch": 1.6834170854271355, + "grad_norm": 0.3902788460254669, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 2010 + }, + { + "epoch": 1.69179229480737, + "grad_norm": 0.39642134308815, + "learning_rate": 0.0002, + "loss": 1.8517, + "step": 2020 + }, + { + "epoch": 1.7001675041876045, + "grad_norm": 0.35721203684806824, + "learning_rate": 0.0002, + "loss": 1.6623, + "step": 2030 + }, + { + "epoch": 1.708542713567839, + "grad_norm": 0.360419899225235, + "learning_rate": 0.0002, + "loss": 1.6988, + "step": 2040 + }, + { + "epoch": 1.7169179229480735, + "grad_norm": 0.3755600154399872, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 2050 + }, + { + "epoch": 1.725293132328308, + "grad_norm": 0.3939184844493866, + "learning_rate": 0.0002, + "loss": 1.6726, + "step": 2060 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.33955490589141846, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 2070 + }, + { + "epoch": 1.742043551088777, + "grad_norm": 0.35501939058303833, + "learning_rate": 0.0002, + "loss": 1.6794, + "step": 2080 + }, + { + "epoch": 1.7504187604690116, + "grad_norm": 0.38298022747039795, + "learning_rate": 0.0002, + "loss": 1.7312, + "step": 2090 + }, + { + "epoch": 1.758793969849246, + "grad_norm": 0.3472785949707031, + "learning_rate": 0.0002, + "loss": 1.6602, + "step": 2100 + }, + { + "epoch": 1.7671691792294806, + "grad_norm": 0.3620430827140808, + "learning_rate": 0.0002, + "loss": 1.6671, + "step": 2110 + }, + { + "epoch": 1.775544388609715, + "grad_norm": 0.3795909881591797, + "learning_rate": 0.0002, + "loss": 1.671, + "step": 2120 + }, + { + "epoch": 1.7839195979899496, + "grad_norm": 0.3662523925304413, + "learning_rate": 0.0002, + "loss": 1.7193, + "step": 2130 + }, + { + "epoch": 1.792294807370184, + "grad_norm": 0.4113886058330536, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 2140 + }, + { + "epoch": 1.8006700167504186, + "grad_norm": 0.3765672743320465, + "learning_rate": 0.0002, + "loss": 1.6681, + "step": 2150 + }, + { + "epoch": 1.809045226130653, + "grad_norm": 0.41623714566230774, + "learning_rate": 0.0002, + "loss": 1.7481, + "step": 2160 + }, + { + "epoch": 1.8174204355108876, + "grad_norm": 0.3724099099636078, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 2170 + }, + { + "epoch": 1.8257956448911221, + "grad_norm": 0.3990779221057892, + "learning_rate": 0.0002, + "loss": 1.6912, + "step": 2180 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.3677702844142914, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 2190 + }, + { + "epoch": 1.8425460636515911, + "grad_norm": 0.3944959342479706, + "learning_rate": 0.0002, + "loss": 1.6705, + "step": 2200 + }, + { + "epoch": 1.8509212730318256, + "grad_norm": 0.3413957357406616, + "learning_rate": 0.0002, + "loss": 1.7619, + "step": 2210 + }, + { + "epoch": 1.8592964824120601, + "grad_norm": 0.40136098861694336, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 2220 + }, + { + "epoch": 1.8676716917922946, + "grad_norm": 0.3496319055557251, + "learning_rate": 0.0002, + "loss": 1.6865, + "step": 2230 + }, + { + "epoch": 1.8760469011725294, + "grad_norm": 0.3759860694408417, + "learning_rate": 0.0002, + "loss": 1.6906, + "step": 2240 + }, + { + "epoch": 1.8844221105527639, + "grad_norm": 0.43556007742881775, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 2250 + }, + { + "epoch": 1.8927973199329984, + "grad_norm": 0.3864828944206238, + "learning_rate": 0.0002, + "loss": 1.66, + "step": 2260 + }, + { + "epoch": 1.9011725293132329, + "grad_norm": 0.396930456161499, + "learning_rate": 0.0002, + "loss": 1.6502, + "step": 2270 + }, + { + "epoch": 1.9095477386934674, + "grad_norm": 0.37667879462242126, + "learning_rate": 0.0002, + "loss": 1.838, + "step": 2280 + }, + { + "epoch": 1.917922948073702, + "grad_norm": 0.3539164066314697, + "learning_rate": 0.0002, + "loss": 1.7315, + "step": 2290 + }, + { + "epoch": 1.9262981574539364, + "grad_norm": 0.40542101860046387, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 2300 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.37341606616973877, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 2310 + }, + { + "epoch": 1.9430485762144054, + "grad_norm": 0.4011504352092743, + "learning_rate": 0.0002, + "loss": 1.7058, + "step": 2320 + }, + { + "epoch": 1.95142378559464, + "grad_norm": 0.37934592366218567, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 2330 + }, + { + "epoch": 1.9597989949748744, + "grad_norm": 0.32745009660720825, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 2340 + }, + { + "epoch": 1.968174204355109, + "grad_norm": 0.38347750902175903, + "learning_rate": 0.0002, + "loss": 1.7673, + "step": 2350 + }, + { + "epoch": 1.9765494137353434, + "grad_norm": 0.3945120871067047, + "learning_rate": 0.0002, + "loss": 1.7116, + "step": 2360 + }, + { + "epoch": 1.984924623115578, + "grad_norm": 0.4034058749675751, + "learning_rate": 0.0002, + "loss": 1.7559, + "step": 2370 + }, + { + "epoch": 1.9932998324958124, + "grad_norm": 0.3546718955039978, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.8061236143112183, + "eval_runtime": 38.2113, + "eval_samples_per_second": 13.478, + "eval_steps_per_second": 1.701, + "step": 2388 + }, + { + "epoch": 2.0016750418760467, + "grad_norm": 0.35184019804000854, + "learning_rate": 0.0002, + "loss": 1.7203, + "step": 2390 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 0.40416669845581055, + "learning_rate": 0.0002, + "loss": 1.6124, + "step": 2400 + }, + { + "epoch": 2.0184254606365157, + "grad_norm": 0.3824569880962372, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2410 + }, + { + "epoch": 2.0268006700167502, + "grad_norm": 0.42036163806915283, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2420 + }, + { + "epoch": 2.0351758793969847, + "grad_norm": 0.40417996048927307, + "learning_rate": 0.0002, + "loss": 1.6176, + "step": 2430 + }, + { + "epoch": 2.0435510887772192, + "grad_norm": 0.45298922061920166, + "learning_rate": 0.0002, + "loss": 1.643, + "step": 2440 + }, + { + "epoch": 2.0519262981574538, + "grad_norm": 0.48289841413497925, + "learning_rate": 0.0002, + "loss": 1.653, + "step": 2450 + }, + { + "epoch": 2.0603015075376883, + "grad_norm": 0.43702399730682373, + "learning_rate": 0.0002, + "loss": 1.5275, + "step": 2460 + }, + { + "epoch": 2.0686767169179228, + "grad_norm": 0.49487054347991943, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2470 + }, + { + "epoch": 2.0770519262981573, + "grad_norm": 0.40030500292778015, + "learning_rate": 0.0002, + "loss": 1.6552, + "step": 2480 + }, + { + "epoch": 2.0854271356783918, + "grad_norm": 0.4664880037307739, + "learning_rate": 0.0002, + "loss": 1.614, + "step": 2490 + }, + { + "epoch": 2.0938023450586263, + "grad_norm": 0.4111400842666626, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 2500 + }, + { + "epoch": 2.102177554438861, + "grad_norm": 0.4155750572681427, + "learning_rate": 0.0002, + "loss": 1.5788, + "step": 2510 + }, + { + "epoch": 2.1105527638190953, + "grad_norm": 0.39257505536079407, + "learning_rate": 0.0002, + "loss": 1.598, + "step": 2520 + }, + { + "epoch": 2.11892797319933, + "grad_norm": 0.4156777560710907, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 2530 + }, + { + "epoch": 2.1273031825795643, + "grad_norm": 0.4025181233882904, + "learning_rate": 0.0002, + "loss": 1.6695, + "step": 2540 + }, + { + "epoch": 2.135678391959799, + "grad_norm": 0.42347562313079834, + "learning_rate": 0.0002, + "loss": 1.6471, + "step": 2550 + }, + { + "epoch": 2.1440536013400333, + "grad_norm": 0.47068294882774353, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 2560 + }, + { + "epoch": 2.152428810720268, + "grad_norm": 0.44081777334213257, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 2570 + }, + { + "epoch": 2.1608040201005023, + "grad_norm": 0.44823798537254333, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 2580 + }, + { + "epoch": 2.169179229480737, + "grad_norm": 0.40486326813697815, + "learning_rate": 0.0002, + "loss": 1.6287, + "step": 2590 + }, + { + "epoch": 2.1775544388609713, + "grad_norm": 0.454236775636673, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 2600 + }, + { + "epoch": 2.185929648241206, + "grad_norm": 0.42555344104766846, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 2610 + }, + { + "epoch": 2.1943048576214403, + "grad_norm": 0.5607381463050842, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2620 + }, + { + "epoch": 2.202680067001675, + "grad_norm": 0.4095611870288849, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 2630 + }, + { + "epoch": 2.2110552763819094, + "grad_norm": 0.419342577457428, + "learning_rate": 0.0002, + "loss": 1.5584, + "step": 2640 + }, + { + "epoch": 2.219430485762144, + "grad_norm": 0.48541849851608276, + "learning_rate": 0.0002, + "loss": 1.5425, + "step": 2650 + }, + { + "epoch": 2.2278056951423784, + "grad_norm": 0.4365246891975403, + "learning_rate": 0.0002, + "loss": 1.6233, + "step": 2660 + }, + { + "epoch": 2.236180904522613, + "grad_norm": 0.46417000889778137, + "learning_rate": 0.0002, + "loss": 1.6886, + "step": 2670 + }, + { + "epoch": 2.2445561139028474, + "grad_norm": 0.5034580230712891, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 2680 + }, + { + "epoch": 2.2529313232830823, + "grad_norm": 0.44852879643440247, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 2690 + }, + { + "epoch": 2.2613065326633164, + "grad_norm": 0.43886998295783997, + "learning_rate": 0.0002, + "loss": 1.6152, + "step": 2700 + }, + { + "epoch": 2.2696817420435513, + "grad_norm": 0.45762625336647034, + "learning_rate": 0.0002, + "loss": 1.6533, + "step": 2710 + }, + { + "epoch": 2.2780569514237854, + "grad_norm": 0.39429017901420593, + "learning_rate": 0.0002, + "loss": 1.5889, + "step": 2720 + }, + { + "epoch": 2.2864321608040203, + "grad_norm": 0.4420442581176758, + "learning_rate": 0.0002, + "loss": 1.6419, + "step": 2730 + }, + { + "epoch": 2.2948073701842544, + "grad_norm": 0.4327794015407562, + "learning_rate": 0.0002, + "loss": 1.6126, + "step": 2740 + }, + { + "epoch": 2.3031825795644894, + "grad_norm": 0.4303780198097229, + "learning_rate": 0.0002, + "loss": 1.6405, + "step": 2750 + }, + { + "epoch": 2.3115577889447234, + "grad_norm": 0.41379377245903015, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 2760 + }, + { + "epoch": 2.3199329983249584, + "grad_norm": 0.4821205735206604, + "learning_rate": 0.0002, + "loss": 1.6744, + "step": 2770 + }, + { + "epoch": 2.3283082077051924, + "grad_norm": 0.46232181787490845, + "learning_rate": 0.0002, + "loss": 1.6694, + "step": 2780 + }, + { + "epoch": 2.3366834170854274, + "grad_norm": 0.44937554001808167, + "learning_rate": 0.0002, + "loss": 1.6341, + "step": 2790 + }, + { + "epoch": 2.3450586264656614, + "grad_norm": 0.443250447511673, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2800 + }, + { + "epoch": 2.3534338358458964, + "grad_norm": 0.4687805473804474, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2810 + }, + { + "epoch": 2.3618090452261304, + "grad_norm": 0.435031920671463, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2820 + }, + { + "epoch": 2.3701842546063654, + "grad_norm": 0.4949858784675598, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 2830 + }, + { + "epoch": 2.3785594639865995, + "grad_norm": 0.46349018812179565, + "learning_rate": 0.0002, + "loss": 1.6803, + "step": 2840 + }, + { + "epoch": 2.3869346733668344, + "grad_norm": 0.46377238631248474, + "learning_rate": 0.0002, + "loss": 1.6586, + "step": 2850 + }, + { + "epoch": 2.3953098827470685, + "grad_norm": 0.6111940741539001, + "learning_rate": 0.0002, + "loss": 1.5384, + "step": 2860 + }, + { + "epoch": 2.4036850921273034, + "grad_norm": 0.45090532302856445, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2870 + }, + { + "epoch": 2.4120603015075375, + "grad_norm": 0.4762120842933655, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2880 + }, + { + "epoch": 2.4204355108877724, + "grad_norm": 0.4397919774055481, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 2890 + }, + { + "epoch": 2.4288107202680065, + "grad_norm": 0.4765152335166931, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2900 + }, + { + "epoch": 2.4371859296482414, + "grad_norm": 0.4347304403781891, + "learning_rate": 0.0002, + "loss": 1.5982, + "step": 2910 + }, + { + "epoch": 2.4455611390284755, + "grad_norm": 0.3918324410915375, + "learning_rate": 0.0002, + "loss": 1.6409, + "step": 2920 + }, + { + "epoch": 2.4539363484087104, + "grad_norm": 0.43932855129241943, + "learning_rate": 0.0002, + "loss": 1.5354, + "step": 2930 + }, + { + "epoch": 2.4623115577889445, + "grad_norm": 0.46946918964385986, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 2940 + }, + { + "epoch": 2.4706867671691795, + "grad_norm": 0.45169174671173096, + "learning_rate": 0.0002, + "loss": 1.6622, + "step": 2950 + }, + { + "epoch": 2.4790619765494135, + "grad_norm": 0.43488186597824097, + "learning_rate": 0.0002, + "loss": 1.6386, + "step": 2960 + }, + { + "epoch": 2.4874371859296485, + "grad_norm": 0.42297765612602234, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 2970 + }, + { + "epoch": 2.4958123953098825, + "grad_norm": 0.4546392560005188, + "learning_rate": 0.0002, + "loss": 1.5708, + "step": 2980 + }, + { + "epoch": 2.5041876046901175, + "grad_norm": 0.4236692488193512, + "learning_rate": 0.0002, + "loss": 1.5944, + "step": 2990 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 0.46421024203300476, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 3000 + }, + { + "epoch": 2.5209380234505865, + "grad_norm": 0.5040220618247986, + "learning_rate": 0.0002, + "loss": 1.6686, + "step": 3010 + }, + { + "epoch": 2.5293132328308205, + "grad_norm": 0.4596138894557953, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.5376884422110555, + "grad_norm": 0.4410228729248047, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3030 + }, + { + "epoch": 2.5460636515912896, + "grad_norm": 0.553693413734436, + "learning_rate": 0.0002, + "loss": 1.6336, + "step": 3040 + }, + { + "epoch": 2.5544388609715245, + "grad_norm": 0.41298043727874756, + "learning_rate": 0.0002, + "loss": 1.6377, + "step": 3050 + }, + { + "epoch": 2.5628140703517586, + "grad_norm": 0.4894513487815857, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 3060 + }, + { + "epoch": 2.5711892797319935, + "grad_norm": 0.5525603294372559, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3070 + }, + { + "epoch": 2.5795644891122276, + "grad_norm": 0.5043630003929138, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 3080 + }, + { + "epoch": 2.5879396984924625, + "grad_norm": 0.4690920412540436, + "learning_rate": 0.0002, + "loss": 1.5641, + "step": 3090 + }, + { + "epoch": 2.5963149078726966, + "grad_norm": 0.4358677566051483, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 3100 + }, + { + "epoch": 2.6046901172529315, + "grad_norm": 0.4621894061565399, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 3110 + }, + { + "epoch": 2.6130653266331656, + "grad_norm": 0.4639507532119751, + "learning_rate": 0.0002, + "loss": 1.7426, + "step": 3120 + }, + { + "epoch": 2.6214405360134005, + "grad_norm": 0.45161309838294983, + "learning_rate": 0.0002, + "loss": 1.6492, + "step": 3130 + }, + { + "epoch": 2.6298157453936346, + "grad_norm": 0.49179261922836304, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 3140 + }, + { + "epoch": 2.6381909547738696, + "grad_norm": 0.4739720821380615, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 3150 + }, + { + "epoch": 2.6465661641541036, + "grad_norm": 0.468252956867218, + "learning_rate": 0.0002, + "loss": 1.616, + "step": 3160 + }, + { + "epoch": 2.6549413735343386, + "grad_norm": 0.44691553711891174, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 3170 + }, + { + "epoch": 2.6633165829145726, + "grad_norm": 0.47537046670913696, + "learning_rate": 0.0002, + "loss": 1.6558, + "step": 3180 + }, + { + "epoch": 2.6716917922948076, + "grad_norm": 0.4445202052593231, + "learning_rate": 0.0002, + "loss": 1.6755, + "step": 3190 + }, + { + "epoch": 2.6800670016750416, + "grad_norm": 0.46785518527030945, + "learning_rate": 0.0002, + "loss": 1.6522, + "step": 3200 + }, + { + "epoch": 2.6884422110552766, + "grad_norm": 0.4807088077068329, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 3210 + }, + { + "epoch": 2.6968174204355106, + "grad_norm": 0.4547516703605652, + "learning_rate": 0.0002, + "loss": 1.6385, + "step": 3220 + }, + { + "epoch": 2.7051926298157456, + "grad_norm": 0.5200821161270142, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 3230 + }, + { + "epoch": 2.7135678391959797, + "grad_norm": 0.4915551245212555, + "learning_rate": 0.0002, + "loss": 1.6434, + "step": 3240 + }, + { + "epoch": 2.7219430485762146, + "grad_norm": 0.4324817955493927, + "learning_rate": 0.0002, + "loss": 1.6146, + "step": 3250 + }, + { + "epoch": 2.7303182579564487, + "grad_norm": 0.6290464997291565, + "learning_rate": 0.0002, + "loss": 1.6154, + "step": 3260 + }, + { + "epoch": 2.7386934673366836, + "grad_norm": 0.42255541682243347, + "learning_rate": 0.0002, + "loss": 1.611, + "step": 3270 + }, + { + "epoch": 2.7470686767169177, + "grad_norm": 0.47089505195617676, + "learning_rate": 0.0002, + "loss": 1.6345, + "step": 3280 + }, + { + "epoch": 2.7554438860971526, + "grad_norm": 0.4492960572242737, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3290 + }, + { + "epoch": 2.7638190954773867, + "grad_norm": 0.4711938202381134, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 3300 + }, + { + "epoch": 2.7721943048576216, + "grad_norm": 0.4635316729545593, + "learning_rate": 0.0002, + "loss": 1.6107, + "step": 3310 + }, + { + "epoch": 2.7805695142378557, + "grad_norm": 0.4207742512226105, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 3320 + }, + { + "epoch": 2.7889447236180906, + "grad_norm": 0.5545504093170166, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 3330 + }, + { + "epoch": 2.7973199329983247, + "grad_norm": 0.46976953744888306, + "learning_rate": 0.0002, + "loss": 1.6642, + "step": 3340 + }, + { + "epoch": 2.8056951423785597, + "grad_norm": 0.4805937111377716, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 3350 + }, + { + "epoch": 2.8140703517587937, + "grad_norm": 0.4986467659473419, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 3360 + }, + { + "epoch": 2.8224455611390287, + "grad_norm": 0.44702932238578796, + "learning_rate": 0.0002, + "loss": 1.6125, + "step": 3370 + }, + { + "epoch": 2.8308207705192627, + "grad_norm": 0.4698854088783264, + "learning_rate": 0.0002, + "loss": 1.6318, + "step": 3380 + }, + { + "epoch": 2.8391959798994977, + "grad_norm": 0.5756528377532959, + "learning_rate": 0.0002, + "loss": 1.6468, + "step": 3390 + }, + { + "epoch": 2.8475711892797317, + "grad_norm": 0.4266531765460968, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 3400 + }, + { + "epoch": 2.8559463986599667, + "grad_norm": 0.5342442989349365, + "learning_rate": 0.0002, + "loss": 1.6351, + "step": 3410 + }, + { + "epoch": 2.8643216080402008, + "grad_norm": 0.47210443019866943, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 3420 + }, + { + "epoch": 2.8726968174204357, + "grad_norm": 0.4491795599460602, + "learning_rate": 0.0002, + "loss": 1.6157, + "step": 3430 + }, + { + "epoch": 2.8810720268006698, + "grad_norm": 0.5387647151947021, + "learning_rate": 0.0002, + "loss": 1.6179, + "step": 3440 + }, + { + "epoch": 2.8894472361809047, + "grad_norm": 0.5059208273887634, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3450 + }, + { + "epoch": 2.8978224455611388, + "grad_norm": 0.472605437040329, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3460 + }, + { + "epoch": 2.9061976549413737, + "grad_norm": 0.499795138835907, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 3470 + }, + { + "epoch": 2.914572864321608, + "grad_norm": 0.4887969493865967, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 3480 + }, + { + "epoch": 2.9229480737018427, + "grad_norm": 0.4670022130012512, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3490 + }, + { + "epoch": 2.931323283082077, + "grad_norm": 0.4475444555282593, + "learning_rate": 0.0002, + "loss": 1.6355, + "step": 3500 + }, + { + "epoch": 2.9396984924623117, + "grad_norm": 0.39244669675827026, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 3510 + }, + { + "epoch": 2.948073701842546, + "grad_norm": 0.4905056059360504, + "learning_rate": 0.0002, + "loss": 1.6094, + "step": 3520 + }, + { + "epoch": 2.9564489112227808, + "grad_norm": 0.4395551085472107, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 3530 + }, + { + "epoch": 2.964824120603015, + "grad_norm": 0.4693661034107208, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3540 + }, + { + "epoch": 2.9731993299832498, + "grad_norm": 0.473781943321228, + "learning_rate": 0.0002, + "loss": 1.648, + "step": 3550 + }, + { + "epoch": 2.981574539363484, + "grad_norm": 0.4374050796031952, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3560 + }, + { + "epoch": 2.9899497487437188, + "grad_norm": 0.46144190430641174, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 3570 + }, + { + "epoch": 2.998324958123953, + "grad_norm": 0.43887680768966675, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 3580 + }, + { + "epoch": 3.0, + "eval_loss": 1.8283122777938843, + "eval_runtime": 38.023, + "eval_samples_per_second": 13.544, + "eval_steps_per_second": 1.709, + "step": 3582 + }, + { + "epoch": 3.006700167504188, + "grad_norm": 0.6784713268280029, + "learning_rate": 0.0002, + "loss": 1.5874, + "step": 3590 + }, + { + "epoch": 3.0150753768844223, + "grad_norm": 0.5783940553665161, + "learning_rate": 0.0002, + "loss": 1.5813, + "step": 3600 + }, + { + "epoch": 3.023450586264657, + "grad_norm": 0.5408937335014343, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 3610 + }, + { + "epoch": 3.0318257956448913, + "grad_norm": 0.5229013562202454, + "learning_rate": 0.0002, + "loss": 1.526, + "step": 3620 + }, + { + "epoch": 3.040201005025126, + "grad_norm": 0.49160143733024597, + "learning_rate": 0.0002, + "loss": 1.4835, + "step": 3630 + }, + { + "epoch": 3.0485762144053603, + "grad_norm": 0.6563201546669006, + "learning_rate": 0.0002, + "loss": 1.5398, + "step": 3640 + }, + { + "epoch": 3.056951423785595, + "grad_norm": 0.5686020851135254, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3650 + }, + { + "epoch": 3.0653266331658293, + "grad_norm": 0.5774043202400208, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3660 + }, + { + "epoch": 3.073701842546064, + "grad_norm": 0.6106171011924744, + "learning_rate": 0.0002, + "loss": 1.4734, + "step": 3670 + }, + { + "epoch": 3.0820770519262983, + "grad_norm": 0.517433226108551, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3680 + }, + { + "epoch": 3.090452261306533, + "grad_norm": 0.5681702494621277, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 3690 + }, + { + "epoch": 3.0988274706867673, + "grad_norm": 0.5769233107566833, + "learning_rate": 0.0002, + "loss": 1.4731, + "step": 3700 + }, + { + "epoch": 3.107202680067002, + "grad_norm": 0.5657462477684021, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 3710 + }, + { + "epoch": 3.1155778894472363, + "grad_norm": 0.6035246253013611, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3720 + }, + { + "epoch": 3.123953098827471, + "grad_norm": 0.7286643385887146, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 3730 + }, + { + "epoch": 3.1323283082077054, + "grad_norm": 0.5121201872825623, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 3740 + }, + { + "epoch": 3.14070351758794, + "grad_norm": 0.5074213147163391, + "learning_rate": 0.0002, + "loss": 1.565, + "step": 3750 + }, + { + "epoch": 3.1490787269681744, + "grad_norm": 0.57481849193573, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 3760 + }, + { + "epoch": 3.157453936348409, + "grad_norm": 0.6326663494110107, + "learning_rate": 0.0002, + "loss": 1.4765, + "step": 3770 + }, + { + "epoch": 3.1658291457286434, + "grad_norm": 0.6039315462112427, + "learning_rate": 0.0002, + "loss": 1.4888, + "step": 3780 + }, + { + "epoch": 3.174204355108878, + "grad_norm": 0.6936715245246887, + "learning_rate": 0.0002, + "loss": 1.5084, + "step": 3790 + }, + { + "epoch": 3.1825795644891124, + "grad_norm": 0.6516796946525574, + "learning_rate": 0.0002, + "loss": 1.4879, + "step": 3800 + }, + { + "epoch": 3.190954773869347, + "grad_norm": 0.6140730977058411, + "learning_rate": 0.0002, + "loss": 1.578, + "step": 3810 + }, + { + "epoch": 3.1993299832495814, + "grad_norm": 0.631328284740448, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3820 + }, + { + "epoch": 3.207705192629816, + "grad_norm": 0.6265402436256409, + "learning_rate": 0.0002, + "loss": 1.4844, + "step": 3830 + }, + { + "epoch": 3.2160804020100504, + "grad_norm": 0.6649428606033325, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 3840 + }, + { + "epoch": 3.224455611390285, + "grad_norm": 0.5329259634017944, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 3850 + }, + { + "epoch": 3.2328308207705194, + "grad_norm": 0.6008304953575134, + "learning_rate": 0.0002, + "loss": 1.5714, + "step": 3860 + }, + { + "epoch": 3.241206030150754, + "grad_norm": 0.5918582081794739, + "learning_rate": 0.0002, + "loss": 1.5214, + "step": 3870 + }, + { + "epoch": 3.2495812395309884, + "grad_norm": 0.643622100353241, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3880 + }, + { + "epoch": 3.257956448911223, + "grad_norm": 0.5517964363098145, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 3890 + }, + { + "epoch": 3.2663316582914574, + "grad_norm": 0.6780755519866943, + "learning_rate": 0.0002, + "loss": 1.5458, + "step": 3900 + }, + { + "epoch": 3.274706867671692, + "grad_norm": 0.6742202639579773, + "learning_rate": 0.0002, + "loss": 1.5743, + "step": 3910 + }, + { + "epoch": 3.2830820770519265, + "grad_norm": 0.6228749752044678, + "learning_rate": 0.0002, + "loss": 1.5279, + "step": 3920 + }, + { + "epoch": 3.291457286432161, + "grad_norm": 0.5836303234100342, + "learning_rate": 0.0002, + "loss": 1.4899, + "step": 3930 + }, + { + "epoch": 3.2998324958123955, + "grad_norm": 0.6337724328041077, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 3940 + }, + { + "epoch": 3.30820770519263, + "grad_norm": 0.6345084309577942, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 3950 + }, + { + "epoch": 3.3165829145728645, + "grad_norm": 0.6125303506851196, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3960 + }, + { + "epoch": 3.324958123953099, + "grad_norm": 0.6259911060333252, + "learning_rate": 0.0002, + "loss": 1.5355, + "step": 3970 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.645745575428009, + "learning_rate": 0.0002, + "loss": 1.5427, + "step": 3980 + }, + { + "epoch": 3.341708542713568, + "grad_norm": 0.6666176915168762, + "learning_rate": 0.0002, + "loss": 1.5817, + "step": 3990 + }, + { + "epoch": 3.3500837520938025, + "grad_norm": 0.59013831615448, + "learning_rate": 0.0002, + "loss": 1.4998, + "step": 4000 + }, + { + "epoch": 3.358458961474037, + "grad_norm": 0.6604634523391724, + "learning_rate": 0.0002, + "loss": 1.4921, + "step": 4010 + }, + { + "epoch": 3.3668341708542715, + "grad_norm": 0.6676120758056641, + "learning_rate": 0.0002, + "loss": 1.5076, + "step": 4020 + }, + { + "epoch": 3.375209380234506, + "grad_norm": 0.515724778175354, + "learning_rate": 0.0002, + "loss": 1.4801, + "step": 4030 + }, + { + "epoch": 3.3835845896147405, + "grad_norm": 0.681968092918396, + "learning_rate": 0.0002, + "loss": 1.4932, + "step": 4040 + }, + { + "epoch": 3.391959798994975, + "grad_norm": 0.5978158116340637, + "learning_rate": 0.0002, + "loss": 1.5148, + "step": 4050 + }, + { + "epoch": 3.4003350083752095, + "grad_norm": 0.6043432354927063, + "learning_rate": 0.0002, + "loss": 1.5449, + "step": 4060 + }, + { + "epoch": 3.408710217755444, + "grad_norm": 0.5899770855903625, + "learning_rate": 0.0002, + "loss": 1.5021, + "step": 4070 + }, + { + "epoch": 3.4170854271356785, + "grad_norm": 0.6014242172241211, + "learning_rate": 0.0002, + "loss": 1.5992, + "step": 4080 + }, + { + "epoch": 3.425460636515913, + "grad_norm": 0.5944811105728149, + "learning_rate": 0.0002, + "loss": 1.4692, + "step": 4090 + }, + { + "epoch": 3.4338358458961475, + "grad_norm": 0.6506822109222412, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 4100 + }, + { + "epoch": 3.442211055276382, + "grad_norm": 0.6926528811454773, + "learning_rate": 0.0002, + "loss": 1.5144, + "step": 4110 + }, + { + "epoch": 3.4505862646566166, + "grad_norm": 0.5646378993988037, + "learning_rate": 0.0002, + "loss": 1.5169, + "step": 4120 + }, + { + "epoch": 3.458961474036851, + "grad_norm": 0.7233654856681824, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 4130 + }, + { + "epoch": 3.4673366834170856, + "grad_norm": 0.6231815814971924, + "learning_rate": 0.0002, + "loss": 1.5161, + "step": 4140 + }, + { + "epoch": 3.47571189279732, + "grad_norm": 0.6115689873695374, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 4150 + }, + { + "epoch": 3.4840871021775546, + "grad_norm": 0.5812674760818481, + "learning_rate": 0.0002, + "loss": 1.4621, + "step": 4160 + }, + { + "epoch": 3.492462311557789, + "grad_norm": 0.6099632978439331, + "learning_rate": 0.0002, + "loss": 1.5465, + "step": 4170 + }, + { + "epoch": 3.5008375209380236, + "grad_norm": 0.6102647185325623, + "learning_rate": 0.0002, + "loss": 1.4795, + "step": 4180 + }, + { + "epoch": 3.509212730318258, + "grad_norm": 0.6034680008888245, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 4190 + }, + { + "epoch": 3.5175879396984926, + "grad_norm": 0.6281666159629822, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 4200 + }, + { + "epoch": 3.525963149078727, + "grad_norm": 0.6245372295379639, + "learning_rate": 0.0002, + "loss": 1.4903, + "step": 4210 + }, + { + "epoch": 3.5343383584589616, + "grad_norm": 0.5897293090820312, + "learning_rate": 0.0002, + "loss": 1.5098, + "step": 4220 + }, + { + "epoch": 3.542713567839196, + "grad_norm": 0.601054847240448, + "learning_rate": 0.0002, + "loss": 1.5991, + "step": 4230 + }, + { + "epoch": 3.5510887772194306, + "grad_norm": 0.7004473805427551, + "learning_rate": 0.0002, + "loss": 1.4974, + "step": 4240 + }, + { + "epoch": 3.559463986599665, + "grad_norm": 0.6601553559303284, + "learning_rate": 0.0002, + "loss": 1.5993, + "step": 4250 + }, + { + "epoch": 3.5678391959798996, + "grad_norm": 0.6112467050552368, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4260 + }, + { + "epoch": 3.576214405360134, + "grad_norm": 0.5902454853057861, + "learning_rate": 0.0002, + "loss": 1.4967, + "step": 4270 + }, + { + "epoch": 3.5845896147403686, + "grad_norm": 0.5792450904846191, + "learning_rate": 0.0002, + "loss": 1.5659, + "step": 4280 + }, + { + "epoch": 3.592964824120603, + "grad_norm": 0.5923888087272644, + "learning_rate": 0.0002, + "loss": 1.4664, + "step": 4290 + }, + { + "epoch": 3.6013400335008376, + "grad_norm": 0.5869482159614563, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 4300 + }, + { + "epoch": 3.609715242881072, + "grad_norm": 0.6372929811477661, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 4310 + }, + { + "epoch": 3.6180904522613067, + "grad_norm": 0.6350686550140381, + "learning_rate": 0.0002, + "loss": 1.4977, + "step": 4320 + }, + { + "epoch": 3.626465661641541, + "grad_norm": 0.571819007396698, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 4330 + }, + { + "epoch": 3.6348408710217757, + "grad_norm": 0.592250645160675, + "learning_rate": 0.0002, + "loss": 1.5414, + "step": 4340 + }, + { + "epoch": 3.64321608040201, + "grad_norm": 0.6110650897026062, + "learning_rate": 0.0002, + "loss": 1.4912, + "step": 4350 + }, + { + "epoch": 3.6515912897822447, + "grad_norm": 0.6187081336975098, + "learning_rate": 0.0002, + "loss": 1.6089, + "step": 4360 + }, + { + "epoch": 3.659966499162479, + "grad_norm": 0.6197671890258789, + "learning_rate": 0.0002, + "loss": 1.5345, + "step": 4370 + }, + { + "epoch": 3.6683417085427137, + "grad_norm": 0.6050862669944763, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 4380 + }, + { + "epoch": 3.676716917922948, + "grad_norm": 0.621265172958374, + "learning_rate": 0.0002, + "loss": 1.4872, + "step": 4390 + }, + { + "epoch": 3.6850921273031827, + "grad_norm": 0.6552940011024475, + "learning_rate": 0.0002, + "loss": 1.6011, + "step": 4400 + }, + { + "epoch": 3.693467336683417, + "grad_norm": 0.5638861060142517, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 4410 + }, + { + "epoch": 3.7018425460636517, + "grad_norm": 0.6388863325119019, + "learning_rate": 0.0002, + "loss": 1.4985, + "step": 4420 + }, + { + "epoch": 3.710217755443886, + "grad_norm": 0.6062559485435486, + "learning_rate": 0.0002, + "loss": 1.3696, + "step": 4430 + }, + { + "epoch": 3.7185929648241207, + "grad_norm": 0.5800350308418274, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 4440 + }, + { + "epoch": 3.726968174204355, + "grad_norm": 0.5954474210739136, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 4450 + }, + { + "epoch": 3.7353433835845897, + "grad_norm": 0.5880125761032104, + "learning_rate": 0.0002, + "loss": 1.6133, + "step": 4460 + }, + { + "epoch": 3.7437185929648242, + "grad_norm": 0.5880921483039856, + "learning_rate": 0.0002, + "loss": 1.5055, + "step": 4470 + }, + { + "epoch": 3.7520938023450587, + "grad_norm": 0.5995073914527893, + "learning_rate": 0.0002, + "loss": 1.5728, + "step": 4480 + }, + { + "epoch": 3.7604690117252932, + "grad_norm": 0.5958493947982788, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 4490 + }, + { + "epoch": 3.7688442211055277, + "grad_norm": 0.5694711804389954, + "learning_rate": 0.0002, + "loss": 1.5472, + "step": 4500 + }, + { + "epoch": 3.7772194304857623, + "grad_norm": 0.6175141930580139, + "learning_rate": 0.0002, + "loss": 1.5105, + "step": 4510 + }, + { + "epoch": 3.7855946398659968, + "grad_norm": 0.5541581511497498, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 4520 + }, + { + "epoch": 3.7939698492462313, + "grad_norm": 0.5986164808273315, + "learning_rate": 0.0002, + "loss": 1.5283, + "step": 4530 + }, + { + "epoch": 3.8023450586264658, + "grad_norm": 0.640072226524353, + "learning_rate": 0.0002, + "loss": 1.4961, + "step": 4540 + }, + { + "epoch": 3.8107202680067003, + "grad_norm": 0.5742579698562622, + "learning_rate": 0.0002, + "loss": 1.5297, + "step": 4550 + }, + { + "epoch": 3.819095477386935, + "grad_norm": 0.6658656001091003, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4560 + }, + { + "epoch": 3.8274706867671693, + "grad_norm": 0.5475369691848755, + "learning_rate": 0.0002, + "loss": 1.4992, + "step": 4570 + }, + { + "epoch": 3.835845896147404, + "grad_norm": 0.613172173500061, + "learning_rate": 0.0002, + "loss": 1.5966, + "step": 4580 + }, + { + "epoch": 3.8442211055276383, + "grad_norm": 0.590968132019043, + "learning_rate": 0.0002, + "loss": 1.5594, + "step": 4590 + }, + { + "epoch": 3.852596314907873, + "grad_norm": 0.5865461826324463, + "learning_rate": 0.0002, + "loss": 1.5067, + "step": 4600 + }, + { + "epoch": 3.8609715242881073, + "grad_norm": 0.6815178990364075, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4610 + }, + { + "epoch": 3.869346733668342, + "grad_norm": 0.6551400423049927, + "learning_rate": 0.0002, + "loss": 1.5702, + "step": 4620 + }, + { + "epoch": 3.8777219430485763, + "grad_norm": 0.6398897171020508, + "learning_rate": 0.0002, + "loss": 1.4891, + "step": 4630 + }, + { + "epoch": 3.886097152428811, + "grad_norm": 0.6761762499809265, + "learning_rate": 0.0002, + "loss": 1.5353, + "step": 4640 + }, + { + "epoch": 3.8944723618090453, + "grad_norm": 0.6277294754981995, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 4650 + }, + { + "epoch": 3.90284757118928, + "grad_norm": 0.6285301446914673, + "learning_rate": 0.0002, + "loss": 1.5605, + "step": 4660 + }, + { + "epoch": 3.9112227805695143, + "grad_norm": 0.5416069626808167, + "learning_rate": 0.0002, + "loss": 1.5937, + "step": 4670 + }, + { + "epoch": 3.919597989949749, + "grad_norm": 0.6314545273780823, + "learning_rate": 0.0002, + "loss": 1.5461, + "step": 4680 + }, + { + "epoch": 3.9279731993299833, + "grad_norm": 0.604479968547821, + "learning_rate": 0.0002, + "loss": 1.4828, + "step": 4690 + }, + { + "epoch": 3.936348408710218, + "grad_norm": 0.5321660041809082, + "learning_rate": 0.0002, + "loss": 1.5186, + "step": 4700 + }, + { + "epoch": 3.9447236180904524, + "grad_norm": 0.6632516980171204, + "learning_rate": 0.0002, + "loss": 1.4696, + "step": 4710 + }, + { + "epoch": 3.953098827470687, + "grad_norm": 0.5925896763801575, + "learning_rate": 0.0002, + "loss": 1.519, + "step": 4720 + }, + { + "epoch": 3.9614740368509214, + "grad_norm": 0.6580308675765991, + "learning_rate": 0.0002, + "loss": 1.5716, + "step": 4730 + }, + { + "epoch": 3.969849246231156, + "grad_norm": 0.5578170418739319, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 4740 + }, + { + "epoch": 3.9782244556113904, + "grad_norm": 0.6216608285903931, + "learning_rate": 0.0002, + "loss": 1.5394, + "step": 4750 + }, + { + "epoch": 3.986599664991625, + "grad_norm": 0.5693069696426392, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 4760 + }, + { + "epoch": 3.9949748743718594, + "grad_norm": 0.5353434681892395, + "learning_rate": 0.0002, + "loss": 1.5517, + "step": 4770 + }, + { + "epoch": 4.0, + "eval_loss": 1.8809821605682373, + "eval_runtime": 37.9695, + "eval_samples_per_second": 13.564, + "eval_steps_per_second": 1.712, + "step": 4776 + }, + { + "epoch": 4.0033500837520934, + "grad_norm": 0.6117817759513855, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 4780 + }, + { + "epoch": 4.011725293132328, + "grad_norm": 0.6816073656082153, + "learning_rate": 0.0002, + "loss": 1.2982, + "step": 4790 + }, + { + "epoch": 4.0201005025125625, + "grad_norm": 0.715548038482666, + "learning_rate": 0.0002, + "loss": 1.3464, + "step": 4800 + }, + { + "epoch": 4.028475711892797, + "grad_norm": 0.8585814833641052, + "learning_rate": 0.0002, + "loss": 1.3918, + "step": 4810 + }, + { + "epoch": 4.0368509212730315, + "grad_norm": 0.7372158765792847, + "learning_rate": 0.0002, + "loss": 1.4137, + "step": 4820 + }, + { + "epoch": 4.045226130653266, + "grad_norm": 0.8915117979049683, + "learning_rate": 0.0002, + "loss": 1.3769, + "step": 4830 + }, + { + "epoch": 4.0536013400335005, + "grad_norm": 0.9323588013648987, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 4840 + }, + { + "epoch": 4.061976549413735, + "grad_norm": 0.9298437237739563, + "learning_rate": 0.0002, + "loss": 1.3687, + "step": 4850 + }, + { + "epoch": 4.0703517587939695, + "grad_norm": 0.8541792035102844, + "learning_rate": 0.0002, + "loss": 1.4173, + "step": 4860 + }, + { + "epoch": 4.078726968174204, + "grad_norm": 0.7833571434020996, + "learning_rate": 0.0002, + "loss": 1.3668, + "step": 4870 + }, + { + "epoch": 4.0871021775544385, + "grad_norm": 0.9325295090675354, + "learning_rate": 0.0002, + "loss": 1.3835, + "step": 4880 + }, + { + "epoch": 4.0954773869346734, + "grad_norm": 0.7066370248794556, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 4890 + }, + { + "epoch": 4.1038525963149075, + "grad_norm": 0.712640643119812, + "learning_rate": 0.0002, + "loss": 1.3661, + "step": 4900 + }, + { + "epoch": 4.1122278056951425, + "grad_norm": 0.6970218420028687, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 4910 + }, + { + "epoch": 4.1206030150753765, + "grad_norm": 0.7979312539100647, + "learning_rate": 0.0002, + "loss": 1.3805, + "step": 4920 + }, + { + "epoch": 4.1289782244556115, + "grad_norm": 0.7801558375358582, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4930 + }, + { + "epoch": 4.1373534338358455, + "grad_norm": 0.7505159974098206, + "learning_rate": 0.0002, + "loss": 1.3288, + "step": 4940 + }, + { + "epoch": 4.1457286432160805, + "grad_norm": 0.738201916217804, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 4950 + }, + { + "epoch": 4.1541038525963145, + "grad_norm": 0.7736659049987793, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 4960 + }, + { + "epoch": 4.1624790619765495, + "grad_norm": 0.7850064635276794, + "learning_rate": 0.0002, + "loss": 1.3663, + "step": 4970 + }, + { + "epoch": 4.1708542713567835, + "grad_norm": 0.8316620588302612, + "learning_rate": 0.0002, + "loss": 1.326, + "step": 4980 + }, + { + "epoch": 4.1792294807370185, + "grad_norm": 0.7217330932617188, + "learning_rate": 0.0002, + "loss": 1.377, + "step": 4990 + }, + { + "epoch": 4.187604690117253, + "grad_norm": 0.7050199508666992, + "learning_rate": 0.0002, + "loss": 1.3299, + "step": 5000 + }, + { + "epoch": 4.1959798994974875, + "grad_norm": 0.6992659568786621, + "learning_rate": 0.0002, + "loss": 1.3798, + "step": 5010 + }, + { + "epoch": 4.204355108877722, + "grad_norm": 0.7648445963859558, + "learning_rate": 0.0002, + "loss": 1.3391, + "step": 5020 + }, + { + "epoch": 4.2127303182579565, + "grad_norm": 0.8093137741088867, + "learning_rate": 0.0002, + "loss": 1.3339, + "step": 5030 + }, + { + "epoch": 4.221105527638191, + "grad_norm": 0.6907750368118286, + "learning_rate": 0.0002, + "loss": 1.37, + "step": 5040 + }, + { + "epoch": 4.2294807370184255, + "grad_norm": 0.7000078558921814, + "learning_rate": 0.0002, + "loss": 1.4231, + "step": 5050 + }, + { + "epoch": 4.23785594639866, + "grad_norm": 0.715034008026123, + "learning_rate": 0.0002, + "loss": 1.3411, + "step": 5060 + }, + { + "epoch": 4.2462311557788945, + "grad_norm": 0.828895628452301, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5070 + }, + { + "epoch": 4.254606365159129, + "grad_norm": 0.7127292156219482, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 5080 + }, + { + "epoch": 4.2629815745393635, + "grad_norm": 0.8256623148918152, + "learning_rate": 0.0002, + "loss": 1.4255, + "step": 5090 + }, + { + "epoch": 4.271356783919598, + "grad_norm": 0.8062452077865601, + "learning_rate": 0.0002, + "loss": 1.4078, + "step": 5100 + }, + { + "epoch": 4.279731993299833, + "grad_norm": 0.6861081123352051, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5110 + }, + { + "epoch": 4.288107202680067, + "grad_norm": 0.7566041350364685, + "learning_rate": 0.0002, + "loss": 1.3463, + "step": 5120 + }, + { + "epoch": 4.296482412060302, + "grad_norm": 0.8734753727912903, + "learning_rate": 0.0002, + "loss": 1.4571, + "step": 5130 + }, + { + "epoch": 4.304857621440536, + "grad_norm": 0.8559320569038391, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5140 + }, + { + "epoch": 4.313232830820771, + "grad_norm": 0.6965576410293579, + "learning_rate": 0.0002, + "loss": 1.3551, + "step": 5150 + }, + { + "epoch": 4.321608040201005, + "grad_norm": 0.8277813792228699, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 5160 + }, + { + "epoch": 4.32998324958124, + "grad_norm": 1.0733633041381836, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5170 + }, + { + "epoch": 4.338358458961474, + "grad_norm": 0.7914809584617615, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5180 + }, + { + "epoch": 4.346733668341709, + "grad_norm": 0.8307849168777466, + "learning_rate": 0.0002, + "loss": 1.3907, + "step": 5190 + }, + { + "epoch": 4.355108877721943, + "grad_norm": 0.7066516280174255, + "learning_rate": 0.0002, + "loss": 1.4318, + "step": 5200 + }, + { + "epoch": 4.363484087102178, + "grad_norm": 0.9676792025566101, + "learning_rate": 0.0002, + "loss": 1.3866, + "step": 5210 + }, + { + "epoch": 4.371859296482412, + "grad_norm": 0.7672301530838013, + "learning_rate": 0.0002, + "loss": 1.3973, + "step": 5220 + }, + { + "epoch": 4.380234505862647, + "grad_norm": 0.6888260245323181, + "learning_rate": 0.0002, + "loss": 1.3576, + "step": 5230 + }, + { + "epoch": 4.388609715242881, + "grad_norm": 0.8775295615196228, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 5240 + }, + { + "epoch": 4.396984924623116, + "grad_norm": 0.8742642998695374, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 5250 + }, + { + "epoch": 4.40536013400335, + "grad_norm": 0.6935433745384216, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 5260 + }, + { + "epoch": 4.413735343383585, + "grad_norm": 0.7726178169250488, + "learning_rate": 0.0002, + "loss": 1.3605, + "step": 5270 + }, + { + "epoch": 4.422110552763819, + "grad_norm": 0.7493860721588135, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5280 + }, + { + "epoch": 4.430485762144054, + "grad_norm": 0.7758517265319824, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 5290 + }, + { + "epoch": 4.438860971524288, + "grad_norm": 0.779315173625946, + "learning_rate": 0.0002, + "loss": 1.2916, + "step": 5300 + }, + { + "epoch": 4.447236180904523, + "grad_norm": 0.7753667235374451, + "learning_rate": 0.0002, + "loss": 1.4483, + "step": 5310 + }, + { + "epoch": 4.455611390284757, + "grad_norm": 0.8738188743591309, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 5320 + }, + { + "epoch": 4.463986599664992, + "grad_norm": 0.8410757184028625, + "learning_rate": 0.0002, + "loss": 1.41, + "step": 5330 + }, + { + "epoch": 4.472361809045226, + "grad_norm": 0.728897750377655, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 5340 + }, + { + "epoch": 4.480737018425461, + "grad_norm": 0.7880531549453735, + "learning_rate": 0.0002, + "loss": 1.4049, + "step": 5350 + }, + { + "epoch": 4.489112227805695, + "grad_norm": 0.8455142378807068, + "learning_rate": 0.0002, + "loss": 1.4106, + "step": 5360 + }, + { + "epoch": 4.49748743718593, + "grad_norm": 0.8527868986129761, + "learning_rate": 0.0002, + "loss": 1.431, + "step": 5370 + }, + { + "epoch": 4.505862646566165, + "grad_norm": 0.7743009328842163, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5380 + }, + { + "epoch": 4.514237855946399, + "grad_norm": 0.7555320858955383, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 5390 + }, + { + "epoch": 4.522613065326633, + "grad_norm": 0.8146619200706482, + "learning_rate": 0.0002, + "loss": 1.3433, + "step": 5400 + }, + { + "epoch": 4.530988274706868, + "grad_norm": 0.8042502999305725, + "learning_rate": 0.0002, + "loss": 1.4859, + "step": 5410 + }, + { + "epoch": 4.539363484087103, + "grad_norm": 0.7329140305519104, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5420 + }, + { + "epoch": 4.547738693467337, + "grad_norm": 0.7574753165245056, + "learning_rate": 0.0002, + "loss": 1.3946, + "step": 5430 + }, + { + "epoch": 4.556113902847571, + "grad_norm": 1.1223409175872803, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 5440 + }, + { + "epoch": 4.564489112227806, + "grad_norm": 0.7647369503974915, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5450 + }, + { + "epoch": 4.572864321608041, + "grad_norm": 0.9135531187057495, + "learning_rate": 0.0002, + "loss": 1.4569, + "step": 5460 + }, + { + "epoch": 4.581239530988275, + "grad_norm": 0.9343693852424622, + "learning_rate": 0.0002, + "loss": 1.4813, + "step": 5470 + }, + { + "epoch": 4.589614740368509, + "grad_norm": 0.869945764541626, + "learning_rate": 0.0002, + "loss": 1.385, + "step": 5480 + }, + { + "epoch": 4.597989949748744, + "grad_norm": 0.7383785843849182, + "learning_rate": 0.0002, + "loss": 1.4067, + "step": 5490 + }, + { + "epoch": 4.606365159128979, + "grad_norm": 0.7988699674606323, + "learning_rate": 0.0002, + "loss": 1.3698, + "step": 5500 + }, + { + "epoch": 4.614740368509213, + "grad_norm": 0.8731256127357483, + "learning_rate": 0.0002, + "loss": 1.3834, + "step": 5510 + }, + { + "epoch": 4.623115577889447, + "grad_norm": 0.7577664256095886, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 5520 + }, + { + "epoch": 4.631490787269682, + "grad_norm": 0.7825039625167847, + "learning_rate": 0.0002, + "loss": 1.4418, + "step": 5530 + }, + { + "epoch": 4.639865996649917, + "grad_norm": 0.8534902930259705, + "learning_rate": 0.0002, + "loss": 1.4594, + "step": 5540 + }, + { + "epoch": 4.648241206030151, + "grad_norm": 0.7403318285942078, + "learning_rate": 0.0002, + "loss": 1.3689, + "step": 5550 + }, + { + "epoch": 4.656616415410385, + "grad_norm": 0.8229990005493164, + "learning_rate": 0.0002, + "loss": 1.4456, + "step": 5560 + }, + { + "epoch": 4.66499162479062, + "grad_norm": 0.8279513716697693, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5570 + }, + { + "epoch": 4.673366834170855, + "grad_norm": 0.8923851251602173, + "learning_rate": 0.0002, + "loss": 1.4472, + "step": 5580 + }, + { + "epoch": 4.681742043551089, + "grad_norm": 0.7457540035247803, + "learning_rate": 0.0002, + "loss": 1.3999, + "step": 5590 + }, + { + "epoch": 4.690117252931323, + "grad_norm": 0.7110715508460999, + "learning_rate": 0.0002, + "loss": 1.4341, + "step": 5600 + }, + { + "epoch": 4.698492462311558, + "grad_norm": 0.7135499119758606, + "learning_rate": 0.0002, + "loss": 1.4327, + "step": 5610 + }, + { + "epoch": 4.706867671691793, + "grad_norm": 0.7606837153434753, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 5620 + }, + { + "epoch": 4.715242881072027, + "grad_norm": 0.9622916579246521, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5630 + }, + { + "epoch": 4.723618090452261, + "grad_norm": 0.7665684819221497, + "learning_rate": 0.0002, + "loss": 1.4, + "step": 5640 + }, + { + "epoch": 4.731993299832496, + "grad_norm": 0.7985475659370422, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 5650 + }, + { + "epoch": 4.740368509212731, + "grad_norm": 0.9179279208183289, + "learning_rate": 0.0002, + "loss": 1.397, + "step": 5660 + }, + { + "epoch": 4.748743718592965, + "grad_norm": 0.8311634063720703, + "learning_rate": 0.0002, + "loss": 1.4379, + "step": 5670 + }, + { + "epoch": 4.757118927973199, + "grad_norm": 0.7773269414901733, + "learning_rate": 0.0002, + "loss": 1.3546, + "step": 5680 + }, + { + "epoch": 4.765494137353434, + "grad_norm": 0.7771748900413513, + "learning_rate": 0.0002, + "loss": 1.4031, + "step": 5690 + }, + { + "epoch": 4.773869346733669, + "grad_norm": 0.7518507242202759, + "learning_rate": 0.0002, + "loss": 1.3724, + "step": 5700 + }, + { + "epoch": 4.782244556113903, + "grad_norm": 0.7699326276779175, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 5710 + }, + { + "epoch": 4.790619765494137, + "grad_norm": 0.7001115679740906, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5720 + }, + { + "epoch": 4.798994974874372, + "grad_norm": 0.7220682501792908, + "learning_rate": 0.0002, + "loss": 1.4257, + "step": 5730 + }, + { + "epoch": 4.807370184254607, + "grad_norm": 0.7654005289077759, + "learning_rate": 0.0002, + "loss": 1.4174, + "step": 5740 + }, + { + "epoch": 4.815745393634841, + "grad_norm": 0.8132795095443726, + "learning_rate": 0.0002, + "loss": 1.3792, + "step": 5750 + }, + { + "epoch": 4.824120603015075, + "grad_norm": 0.7105404138565063, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5760 + }, + { + "epoch": 4.83249581239531, + "grad_norm": 0.9346209764480591, + "learning_rate": 0.0002, + "loss": 1.4289, + "step": 5770 + }, + { + "epoch": 4.840871021775545, + "grad_norm": 1.0075623989105225, + "learning_rate": 0.0002, + "loss": 1.4066, + "step": 5780 + }, + { + "epoch": 4.849246231155779, + "grad_norm": 0.758376955986023, + "learning_rate": 0.0002, + "loss": 1.4558, + "step": 5790 + }, + { + "epoch": 4.857621440536013, + "grad_norm": 0.854821503162384, + "learning_rate": 0.0002, + "loss": 1.4117, + "step": 5800 + }, + { + "epoch": 4.865996649916248, + "grad_norm": 0.8226943016052246, + "learning_rate": 0.0002, + "loss": 1.4014, + "step": 5810 + }, + { + "epoch": 4.874371859296483, + "grad_norm": 0.7510473728179932, + "learning_rate": 0.0002, + "loss": 1.3963, + "step": 5820 + }, + { + "epoch": 4.882747068676717, + "grad_norm": 0.7449678182601929, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 5830 + }, + { + "epoch": 4.891122278056951, + "grad_norm": 0.7840824723243713, + "learning_rate": 0.0002, + "loss": 1.3691, + "step": 5840 + }, + { + "epoch": 4.899497487437186, + "grad_norm": 0.8811169862747192, + "learning_rate": 0.0002, + "loss": 1.3795, + "step": 5850 + }, + { + "epoch": 4.907872696817421, + "grad_norm": 0.84914630651474, + "learning_rate": 0.0002, + "loss": 1.3827, + "step": 5860 + }, + { + "epoch": 4.916247906197655, + "grad_norm": 0.7514461874961853, + "learning_rate": 0.0002, + "loss": 1.4549, + "step": 5870 + }, + { + "epoch": 4.924623115577889, + "grad_norm": 0.7229002118110657, + "learning_rate": 0.0002, + "loss": 1.3633, + "step": 5880 + }, + { + "epoch": 4.932998324958124, + "grad_norm": 0.9418245553970337, + "learning_rate": 0.0002, + "loss": 1.4302, + "step": 5890 + }, + { + "epoch": 4.941373534338359, + "grad_norm": 0.7626827359199524, + "learning_rate": 0.0002, + "loss": 1.4747, + "step": 5900 + }, + { + "epoch": 4.949748743718593, + "grad_norm": 0.7711105346679688, + "learning_rate": 0.0002, + "loss": 1.4462, + "step": 5910 + }, + { + "epoch": 4.958123953098827, + "grad_norm": 0.8689648509025574, + "learning_rate": 0.0002, + "loss": 1.4104, + "step": 5920 + }, + { + "epoch": 4.966499162479062, + "grad_norm": 0.7873271107673645, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 5930 + }, + { + "epoch": 4.974874371859297, + "grad_norm": 0.7637495994567871, + "learning_rate": 0.0002, + "loss": 1.4361, + "step": 5940 + }, + { + "epoch": 4.983249581239531, + "grad_norm": 0.9907955527305603, + "learning_rate": 0.0002, + "loss": 1.5037, + "step": 5950 + }, + { + "epoch": 4.991624790619765, + "grad_norm": 0.7827328443527222, + "learning_rate": 0.0002, + "loss": 1.4476, + "step": 5960 + }, + { + "epoch": 5.0, + "grad_norm": 0.818544328212738, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5970 + }, + { + "epoch": 5.0, + "eval_loss": 1.9436752796173096, + "eval_runtime": 38.087, + "eval_samples_per_second": 13.522, + "eval_steps_per_second": 1.707, + "step": 5970 + }, + { + "epoch": 5.008375209380235, + "grad_norm": 1.1248953342437744, + "learning_rate": 0.0002, + "loss": 1.2367, + "step": 5980 + }, + { + "epoch": 5.016750418760469, + "grad_norm": 0.9285888075828552, + "learning_rate": 0.0002, + "loss": 1.2221, + "step": 5990 + }, + { + "epoch": 5.025125628140704, + "grad_norm": 0.8626338839530945, + "learning_rate": 0.0002, + "loss": 1.263, + "step": 6000 + }, + { + "epoch": 5.033500837520938, + "grad_norm": 0.8253921270370483, + "learning_rate": 0.0002, + "loss": 1.1839, + "step": 6010 + }, + { + "epoch": 5.041876046901173, + "grad_norm": 1.079628586769104, + "learning_rate": 0.0002, + "loss": 1.2773, + "step": 6020 + }, + { + "epoch": 5.050251256281407, + "grad_norm": 0.902625322341919, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6030 + }, + { + "epoch": 5.058626465661642, + "grad_norm": 0.9593151211738586, + "learning_rate": 0.0002, + "loss": 1.164, + "step": 6040 + }, + { + "epoch": 5.067001675041876, + "grad_norm": 0.9276060461997986, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 6050 + }, + { + "epoch": 5.075376884422111, + "grad_norm": 1.0472362041473389, + "learning_rate": 0.0002, + "loss": 1.2496, + "step": 6060 + }, + { + "epoch": 5.083752093802345, + "grad_norm": 0.9126865863800049, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 6070 + }, + { + "epoch": 5.09212730318258, + "grad_norm": 1.0797888040542603, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 6080 + }, + { + "epoch": 5.100502512562814, + "grad_norm": 0.9538877010345459, + "learning_rate": 0.0002, + "loss": 1.2299, + "step": 6090 + }, + { + "epoch": 5.108877721943049, + "grad_norm": 1.0604161024093628, + "learning_rate": 0.0002, + "loss": 1.2585, + "step": 6100 + }, + { + "epoch": 5.117252931323283, + "grad_norm": 1.0178192853927612, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6110 + }, + { + "epoch": 5.125628140703517, + "grad_norm": 1.0262689590454102, + "learning_rate": 0.0002, + "loss": 1.2848, + "step": 6120 + }, + { + "epoch": 5.134003350083752, + "grad_norm": 0.9046729803085327, + "learning_rate": 0.0002, + "loss": 1.228, + "step": 6130 + }, + { + "epoch": 5.142378559463987, + "grad_norm": 1.1244608163833618, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 6140 + }, + { + "epoch": 5.150753768844221, + "grad_norm": 1.082835078239441, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 6150 + }, + { + "epoch": 5.159128978224456, + "grad_norm": 0.9078734517097473, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 6160 + }, + { + "epoch": 5.16750418760469, + "grad_norm": 1.0688848495483398, + "learning_rate": 0.0002, + "loss": 1.2122, + "step": 6170 + }, + { + "epoch": 5.175879396984925, + "grad_norm": 1.137519359588623, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 6180 + }, + { + "epoch": 5.184254606365159, + "grad_norm": 1.0728670358657837, + "learning_rate": 0.0002, + "loss": 1.3125, + "step": 6190 + }, + { + "epoch": 5.192629815745394, + "grad_norm": 1.2384949922561646, + "learning_rate": 0.0002, + "loss": 1.2352, + "step": 6200 + }, + { + "epoch": 5.201005025125628, + "grad_norm": 0.8391274809837341, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6210 + }, + { + "epoch": 5.209380234505863, + "grad_norm": 0.8948764801025391, + "learning_rate": 0.0002, + "loss": 1.2179, + "step": 6220 + }, + { + "epoch": 5.217755443886097, + "grad_norm": 0.9568309783935547, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6230 + }, + { + "epoch": 5.226130653266332, + "grad_norm": 1.0604485273361206, + "learning_rate": 0.0002, + "loss": 1.2761, + "step": 6240 + }, + { + "epoch": 5.234505862646566, + "grad_norm": 1.1278935670852661, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6250 + }, + { + "epoch": 5.242881072026801, + "grad_norm": 0.9903607368469238, + "learning_rate": 0.0002, + "loss": 1.2332, + "step": 6260 + }, + { + "epoch": 5.251256281407035, + "grad_norm": 0.958718478679657, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6270 + }, + { + "epoch": 5.259631490787269, + "grad_norm": 1.127510905265808, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 6280 + }, + { + "epoch": 5.268006700167504, + "grad_norm": 1.1683127880096436, + "learning_rate": 0.0002, + "loss": 1.2589, + "step": 6290 + }, + { + "epoch": 5.276381909547739, + "grad_norm": 1.0723326206207275, + "learning_rate": 0.0002, + "loss": 1.2959, + "step": 6300 + }, + { + "epoch": 5.284757118927973, + "grad_norm": 0.9285374283790588, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 6310 + }, + { + "epoch": 5.293132328308207, + "grad_norm": 0.9201741218566895, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6320 + }, + { + "epoch": 5.301507537688442, + "grad_norm": 0.9606702923774719, + "learning_rate": 0.0002, + "loss": 1.1816, + "step": 6330 + }, + { + "epoch": 5.309882747068677, + "grad_norm": 1.107960820198059, + "learning_rate": 0.0002, + "loss": 1.2928, + "step": 6340 + }, + { + "epoch": 5.318257956448911, + "grad_norm": 0.9342933297157288, + "learning_rate": 0.0002, + "loss": 1.209, + "step": 6350 + }, + { + "epoch": 5.326633165829146, + "grad_norm": 0.9170576930046082, + "learning_rate": 0.0002, + "loss": 1.2023, + "step": 6360 + }, + { + "epoch": 5.33500837520938, + "grad_norm": 0.7612091898918152, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 6370 + }, + { + "epoch": 5.343383584589615, + "grad_norm": 1.2524093389511108, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 6380 + }, + { + "epoch": 5.351758793969849, + "grad_norm": 0.8481650352478027, + "learning_rate": 0.0002, + "loss": 1.219, + "step": 6390 + }, + { + "epoch": 5.360134003350084, + "grad_norm": 1.0562204122543335, + "learning_rate": 0.0002, + "loss": 1.237, + "step": 6400 + }, + { + "epoch": 5.368509212730318, + "grad_norm": 0.96522456407547, + "learning_rate": 0.0002, + "loss": 1.1844, + "step": 6410 + }, + { + "epoch": 5.376884422110553, + "grad_norm": 0.9680143594741821, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6420 + }, + { + "epoch": 5.385259631490787, + "grad_norm": 0.9743781685829163, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 6430 + }, + { + "epoch": 5.393634840871022, + "grad_norm": 0.8907374143600464, + "learning_rate": 0.0002, + "loss": 1.2637, + "step": 6440 + }, + { + "epoch": 5.402010050251256, + "grad_norm": 1.3755217790603638, + "learning_rate": 0.0002, + "loss": 1.2174, + "step": 6450 + }, + { + "epoch": 5.410385259631491, + "grad_norm": 1.1926233768463135, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 6460 + }, + { + "epoch": 5.418760469011725, + "grad_norm": 0.8343448638916016, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 6470 + }, + { + "epoch": 5.42713567839196, + "grad_norm": 1.0056027173995972, + "learning_rate": 0.0002, + "loss": 1.232, + "step": 6480 + }, + { + "epoch": 5.435510887772194, + "grad_norm": 0.9482131600379944, + "learning_rate": 0.0002, + "loss": 1.2936, + "step": 6490 + }, + { + "epoch": 5.443886097152429, + "grad_norm": 0.9766585826873779, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6500 + }, + { + "epoch": 5.452261306532663, + "grad_norm": 0.9226584434509277, + "learning_rate": 0.0002, + "loss": 1.2758, + "step": 6510 + }, + { + "epoch": 5.460636515912898, + "grad_norm": 0.9605025053024292, + "learning_rate": 0.0002, + "loss": 1.328, + "step": 6520 + }, + { + "epoch": 5.469011725293132, + "grad_norm": 1.0022773742675781, + "learning_rate": 0.0002, + "loss": 1.3285, + "step": 6530 + }, + { + "epoch": 5.477386934673367, + "grad_norm": 1.056764841079712, + "learning_rate": 0.0002, + "loss": 1.3126, + "step": 6540 + }, + { + "epoch": 5.485762144053601, + "grad_norm": 0.9648325443267822, + "learning_rate": 0.0002, + "loss": 1.3018, + "step": 6550 + }, + { + "epoch": 5.494137353433836, + "grad_norm": 0.8987206816673279, + "learning_rate": 0.0002, + "loss": 1.2633, + "step": 6560 + }, + { + "epoch": 5.50251256281407, + "grad_norm": 1.1946845054626465, + "learning_rate": 0.0002, + "loss": 1.2356, + "step": 6570 + }, + { + "epoch": 5.510887772194305, + "grad_norm": 1.037416696548462, + "learning_rate": 0.0002, + "loss": 1.2613, + "step": 6580 + }, + { + "epoch": 5.519262981574539, + "grad_norm": 1.085598349571228, + "learning_rate": 0.0002, + "loss": 1.2873, + "step": 6590 + }, + { + "epoch": 5.527638190954773, + "grad_norm": 0.9253745079040527, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6600 + }, + { + "epoch": 5.536013400335008, + "grad_norm": 1.0624418258666992, + "learning_rate": 0.0002, + "loss": 1.3037, + "step": 6610 + }, + { + "epoch": 5.544388609715243, + "grad_norm": 1.002821922302246, + "learning_rate": 0.0002, + "loss": 1.2523, + "step": 6620 + }, + { + "epoch": 5.552763819095477, + "grad_norm": 0.9343662858009338, + "learning_rate": 0.0002, + "loss": 1.2662, + "step": 6630 + }, + { + "epoch": 5.561139028475711, + "grad_norm": 0.9129965305328369, + "learning_rate": 0.0002, + "loss": 1.2467, + "step": 6640 + }, + { + "epoch": 5.569514237855946, + "grad_norm": 1.220263957977295, + "learning_rate": 0.0002, + "loss": 1.2931, + "step": 6650 + }, + { + "epoch": 5.577889447236181, + "grad_norm": 0.9705421924591064, + "learning_rate": 0.0002, + "loss": 1.2638, + "step": 6660 + }, + { + "epoch": 5.586264656616415, + "grad_norm": 0.8417587876319885, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 6670 + }, + { + "epoch": 5.594639865996649, + "grad_norm": 0.9351304769515991, + "learning_rate": 0.0002, + "loss": 1.3616, + "step": 6680 + }, + { + "epoch": 5.603015075376884, + "grad_norm": 1.012598991394043, + "learning_rate": 0.0002, + "loss": 1.2795, + "step": 6690 + }, + { + "epoch": 5.611390284757119, + "grad_norm": 1.018328309059143, + "learning_rate": 0.0002, + "loss": 1.2457, + "step": 6700 + }, + { + "epoch": 5.619765494137353, + "grad_norm": 0.9289278388023376, + "learning_rate": 0.0002, + "loss": 1.3084, + "step": 6710 + }, + { + "epoch": 5.628140703517588, + "grad_norm": 0.8390841484069824, + "learning_rate": 0.0002, + "loss": 1.2645, + "step": 6720 + }, + { + "epoch": 5.636515912897822, + "grad_norm": 0.9989390969276428, + "learning_rate": 0.0002, + "loss": 1.2676, + "step": 6730 + }, + { + "epoch": 5.644891122278057, + "grad_norm": 1.0675761699676514, + "learning_rate": 0.0002, + "loss": 1.2937, + "step": 6740 + }, + { + "epoch": 5.653266331658291, + "grad_norm": 1.0649791955947876, + "learning_rate": 0.0002, + "loss": 1.2599, + "step": 6750 + }, + { + "epoch": 5.661641541038526, + "grad_norm": 0.8542222380638123, + "learning_rate": 0.0002, + "loss": 1.2191, + "step": 6760 + }, + { + "epoch": 5.67001675041876, + "grad_norm": 0.9148173928260803, + "learning_rate": 0.0002, + "loss": 1.2336, + "step": 6770 + }, + { + "epoch": 5.678391959798995, + "grad_norm": 0.978024423122406, + "learning_rate": 0.0002, + "loss": 1.3286, + "step": 6780 + }, + { + "epoch": 5.686767169179229, + "grad_norm": 1.0385138988494873, + "learning_rate": 0.0002, + "loss": 1.2821, + "step": 6790 + }, + { + "epoch": 5.695142378559464, + "grad_norm": 0.9687889218330383, + "learning_rate": 0.0002, + "loss": 1.218, + "step": 6800 + }, + { + "epoch": 5.703517587939698, + "grad_norm": 0.862335205078125, + "learning_rate": 0.0002, + "loss": 1.3256, + "step": 6810 + }, + { + "epoch": 5.711892797319933, + "grad_norm": 0.9729578495025635, + "learning_rate": 0.0002, + "loss": 1.2783, + "step": 6820 + }, + { + "epoch": 5.720268006700167, + "grad_norm": 0.8936806321144104, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6830 + }, + { + "epoch": 5.728643216080402, + "grad_norm": 0.9222455620765686, + "learning_rate": 0.0002, + "loss": 1.27, + "step": 6840 + }, + { + "epoch": 5.7370184254606365, + "grad_norm": 1.0584437847137451, + "learning_rate": 0.0002, + "loss": 1.2097, + "step": 6850 + }, + { + "epoch": 5.745393634840871, + "grad_norm": 0.9114518165588379, + "learning_rate": 0.0002, + "loss": 1.2308, + "step": 6860 + }, + { + "epoch": 5.7537688442211055, + "grad_norm": 0.9590078592300415, + "learning_rate": 0.0002, + "loss": 1.2767, + "step": 6870 + }, + { + "epoch": 5.76214405360134, + "grad_norm": 0.9056822061538696, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 6880 + }, + { + "epoch": 5.7705192629815745, + "grad_norm": 1.0069063901901245, + "learning_rate": 0.0002, + "loss": 1.3257, + "step": 6890 + }, + { + "epoch": 5.778894472361809, + "grad_norm": 0.9810041189193726, + "learning_rate": 0.0002, + "loss": 1.3382, + "step": 6900 + }, + { + "epoch": 5.7872696817420435, + "grad_norm": 0.881629228591919, + "learning_rate": 0.0002, + "loss": 1.2907, + "step": 6910 + }, + { + "epoch": 5.795644891122278, + "grad_norm": 1.1020095348358154, + "learning_rate": 0.0002, + "loss": 1.3122, + "step": 6920 + }, + { + "epoch": 5.8040201005025125, + "grad_norm": 0.8774619102478027, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 6930 + }, + { + "epoch": 5.812395309882747, + "grad_norm": 0.9321739673614502, + "learning_rate": 0.0002, + "loss": 1.311, + "step": 6940 + }, + { + "epoch": 5.8207705192629815, + "grad_norm": 0.9082857966423035, + "learning_rate": 0.0002, + "loss": 1.2951, + "step": 6950 + }, + { + "epoch": 5.8291457286432165, + "grad_norm": 0.9119554758071899, + "learning_rate": 0.0002, + "loss": 1.2582, + "step": 6960 + }, + { + "epoch": 5.8375209380234505, + "grad_norm": 1.0643284320831299, + "learning_rate": 0.0002, + "loss": 1.2777, + "step": 6970 + }, + { + "epoch": 5.8458961474036855, + "grad_norm": 0.8526089787483215, + "learning_rate": 0.0002, + "loss": 1.3319, + "step": 6980 + }, + { + "epoch": 5.8542713567839195, + "grad_norm": 0.930439829826355, + "learning_rate": 0.0002, + "loss": 1.2539, + "step": 6990 + }, + { + "epoch": 5.8626465661641545, + "grad_norm": 1.0461677312850952, + "learning_rate": 0.0002, + "loss": 1.3059, + "step": 7000 + }, + { + "epoch": 5.8710217755443885, + "grad_norm": 0.92561936378479, + "learning_rate": 0.0002, + "loss": 1.2623, + "step": 7010 + }, + { + "epoch": 5.8793969849246235, + "grad_norm": 0.8936395049095154, + "learning_rate": 0.0002, + "loss": 1.2354, + "step": 7020 + }, + { + "epoch": 5.8877721943048575, + "grad_norm": 0.986539363861084, + "learning_rate": 0.0002, + "loss": 1.3232, + "step": 7030 + }, + { + "epoch": 5.8961474036850925, + "grad_norm": 0.8776476383209229, + "learning_rate": 0.0002, + "loss": 1.2399, + "step": 7040 + }, + { + "epoch": 5.9045226130653266, + "grad_norm": 1.0256905555725098, + "learning_rate": 0.0002, + "loss": 1.2374, + "step": 7050 + }, + { + "epoch": 5.9128978224455615, + "grad_norm": 0.96241295337677, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 7060 + }, + { + "epoch": 5.921273031825796, + "grad_norm": 1.0251280069351196, + "learning_rate": 0.0002, + "loss": 1.2349, + "step": 7070 + }, + { + "epoch": 5.9296482412060305, + "grad_norm": 1.0794076919555664, + "learning_rate": 0.0002, + "loss": 1.2225, + "step": 7080 + }, + { + "epoch": 5.938023450586265, + "grad_norm": 0.9852448105812073, + "learning_rate": 0.0002, + "loss": 1.2978, + "step": 7090 + }, + { + "epoch": 5.9463986599664995, + "grad_norm": 1.1678671836853027, + "learning_rate": 0.0002, + "loss": 1.3278, + "step": 7100 + }, + { + "epoch": 5.954773869346734, + "grad_norm": 0.9818310141563416, + "learning_rate": 0.0002, + "loss": 1.2908, + "step": 7110 + }, + { + "epoch": 5.9631490787269685, + "grad_norm": 1.0732046365737915, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 7120 + }, + { + "epoch": 5.971524288107203, + "grad_norm": 0.912470281124115, + "learning_rate": 0.0002, + "loss": 1.2402, + "step": 7130 + }, + { + "epoch": 5.9798994974874375, + "grad_norm": 1.0944788455963135, + "learning_rate": 0.0002, + "loss": 1.2979, + "step": 7140 + }, + { + "epoch": 5.988274706867672, + "grad_norm": 1.0393965244293213, + "learning_rate": 0.0002, + "loss": 1.3249, + "step": 7150 + }, + { + "epoch": 5.9966499162479066, + "grad_norm": 0.8758739233016968, + "learning_rate": 0.0002, + "loss": 1.2913, + "step": 7160 + }, + { + "epoch": 6.0, + "eval_loss": 2.0526134967803955, + "eval_runtime": 37.9699, + "eval_samples_per_second": 13.563, + "eval_steps_per_second": 1.712, + "step": 7164 + }, + { + "epoch": 6.005025125628141, + "grad_norm": 1.138184666633606, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 7170 + }, + { + "epoch": 6.013400335008376, + "grad_norm": 0.9295315742492676, + "learning_rate": 0.0002, + "loss": 1.0727, + "step": 7180 + }, + { + "epoch": 6.02177554438861, + "grad_norm": 1.1252633333206177, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 7190 + }, + { + "epoch": 6.030150753768845, + "grad_norm": 1.0611635446548462, + "learning_rate": 0.0002, + "loss": 1.0827, + "step": 7200 + }, + { + "epoch": 6.038525963149079, + "grad_norm": 1.022278070449829, + "learning_rate": 0.0002, + "loss": 1.0756, + "step": 7210 + }, + { + "epoch": 6.046901172529314, + "grad_norm": 1.0280728340148926, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 7220 + }, + { + "epoch": 6.055276381909548, + "grad_norm": 0.9516313076019287, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 7230 + }, + { + "epoch": 6.063651591289783, + "grad_norm": 1.0925321578979492, + "learning_rate": 0.0002, + "loss": 1.0388, + "step": 7240 + }, + { + "epoch": 6.072026800670017, + "grad_norm": 0.9885565042495728, + "learning_rate": 0.0002, + "loss": 1.113, + "step": 7250 + }, + { + "epoch": 6.080402010050252, + "grad_norm": 1.0905766487121582, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 7260 + }, + { + "epoch": 6.088777219430486, + "grad_norm": 1.075183391571045, + "learning_rate": 0.0002, + "loss": 1.0775, + "step": 7270 + }, + { + "epoch": 6.097152428810721, + "grad_norm": 1.0897727012634277, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 7280 + }, + { + "epoch": 6.105527638190955, + "grad_norm": 1.3677806854248047, + "learning_rate": 0.0002, + "loss": 1.0335, + "step": 7290 + }, + { + "epoch": 6.11390284757119, + "grad_norm": 1.1880329847335815, + "learning_rate": 0.0002, + "loss": 1.0566, + "step": 7300 + }, + { + "epoch": 6.122278056951424, + "grad_norm": 1.036330223083496, + "learning_rate": 0.0002, + "loss": 1.061, + "step": 7310 + }, + { + "epoch": 6.130653266331659, + "grad_norm": 1.2165348529815674, + "learning_rate": 0.0002, + "loss": 1.0621, + "step": 7320 + }, + { + "epoch": 6.139028475711893, + "grad_norm": 1.027368187904358, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 7330 + }, + { + "epoch": 6.147403685092128, + "grad_norm": 1.2497830390930176, + "learning_rate": 0.0002, + "loss": 1.0994, + "step": 7340 + }, + { + "epoch": 6.155778894472362, + "grad_norm": 1.166595458984375, + "learning_rate": 0.0002, + "loss": 1.1616, + "step": 7350 + }, + { + "epoch": 6.164154103852597, + "grad_norm": 1.1143730878829956, + "learning_rate": 0.0002, + "loss": 1.1301, + "step": 7360 + }, + { + "epoch": 6.172529313232831, + "grad_norm": 1.1531223058700562, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 7370 + }, + { + "epoch": 6.180904522613066, + "grad_norm": 1.176507830619812, + "learning_rate": 0.0002, + "loss": 1.0819, + "step": 7380 + }, + { + "epoch": 6.1892797319933, + "grad_norm": 1.3174604177474976, + "learning_rate": 0.0002, + "loss": 1.0375, + "step": 7390 + }, + { + "epoch": 6.197654941373535, + "grad_norm": 1.0284459590911865, + "learning_rate": 0.0002, + "loss": 1.1586, + "step": 7400 + }, + { + "epoch": 6.206030150753769, + "grad_norm": 1.0801599025726318, + "learning_rate": 0.0002, + "loss": 1.1044, + "step": 7410 + }, + { + "epoch": 6.214405360134004, + "grad_norm": 1.200514554977417, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 7420 + }, + { + "epoch": 6.222780569514238, + "grad_norm": 1.0148060321807861, + "learning_rate": 0.0002, + "loss": 1.0234, + "step": 7430 + }, + { + "epoch": 6.231155778894473, + "grad_norm": 1.2368836402893066, + "learning_rate": 0.0002, + "loss": 1.0616, + "step": 7440 + }, + { + "epoch": 6.239530988274707, + "grad_norm": 1.228834629058838, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 7450 + }, + { + "epoch": 6.247906197654942, + "grad_norm": 1.1588891744613647, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 7460 + }, + { + "epoch": 6.256281407035176, + "grad_norm": 1.3500380516052246, + "learning_rate": 0.0002, + "loss": 1.0807, + "step": 7470 + }, + { + "epoch": 6.264656616415411, + "grad_norm": 1.1429533958435059, + "learning_rate": 0.0002, + "loss": 1.1057, + "step": 7480 + }, + { + "epoch": 6.273031825795645, + "grad_norm": 1.2314441204071045, + "learning_rate": 0.0002, + "loss": 1.1519, + "step": 7490 + }, + { + "epoch": 6.28140703517588, + "grad_norm": 1.0917996168136597, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7500 + }, + { + "epoch": 6.289782244556114, + "grad_norm": 1.3294450044631958, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 7510 + }, + { + "epoch": 6.298157453936349, + "grad_norm": 1.1035195589065552, + "learning_rate": 0.0002, + "loss": 1.1187, + "step": 7520 + }, + { + "epoch": 6.306532663316583, + "grad_norm": 1.2643269300460815, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 7530 + }, + { + "epoch": 6.314907872696818, + "grad_norm": 1.2226417064666748, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7540 + }, + { + "epoch": 6.323283082077052, + "grad_norm": 1.0248615741729736, + "learning_rate": 0.0002, + "loss": 1.1335, + "step": 7550 + }, + { + "epoch": 6.331658291457287, + "grad_norm": 1.28317129611969, + "learning_rate": 0.0002, + "loss": 1.0856, + "step": 7560 + }, + { + "epoch": 6.340033500837521, + "grad_norm": 1.1461660861968994, + "learning_rate": 0.0002, + "loss": 1.166, + "step": 7570 + }, + { + "epoch": 6.348408710217756, + "grad_norm": 1.297136664390564, + "learning_rate": 0.0002, + "loss": 1.1627, + "step": 7580 + }, + { + "epoch": 6.35678391959799, + "grad_norm": 1.3376781940460205, + "learning_rate": 0.0002, + "loss": 1.1342, + "step": 7590 + }, + { + "epoch": 6.365159128978225, + "grad_norm": 1.2507376670837402, + "learning_rate": 0.0002, + "loss": 1.072, + "step": 7600 + }, + { + "epoch": 6.373534338358459, + "grad_norm": 1.3255126476287842, + "learning_rate": 0.0002, + "loss": 1.0731, + "step": 7610 + }, + { + "epoch": 6.381909547738694, + "grad_norm": 1.1082066297531128, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 7620 + }, + { + "epoch": 6.390284757118928, + "grad_norm": 1.4461497068405151, + "learning_rate": 0.0002, + "loss": 1.0894, + "step": 7630 + }, + { + "epoch": 6.398659966499163, + "grad_norm": 1.2875033617019653, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 7640 + }, + { + "epoch": 6.407035175879397, + "grad_norm": 1.1017295122146606, + "learning_rate": 0.0002, + "loss": 1.1027, + "step": 7650 + }, + { + "epoch": 6.415410385259632, + "grad_norm": 1.1896536350250244, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 7660 + }, + { + "epoch": 6.423785594639866, + "grad_norm": 1.0939011573791504, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 7670 + }, + { + "epoch": 6.432160804020101, + "grad_norm": 1.2593132257461548, + "learning_rate": 0.0002, + "loss": 1.1338, + "step": 7680 + }, + { + "epoch": 6.440536013400335, + "grad_norm": 1.1151225566864014, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 7690 + }, + { + "epoch": 6.44891122278057, + "grad_norm": 1.0686280727386475, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 7700 + }, + { + "epoch": 6.457286432160804, + "grad_norm": 1.4008738994598389, + "learning_rate": 0.0002, + "loss": 1.1611, + "step": 7710 + }, + { + "epoch": 6.465661641541039, + "grad_norm": 1.1698687076568604, + "learning_rate": 0.0002, + "loss": 1.1191, + "step": 7720 + }, + { + "epoch": 6.474036850921273, + "grad_norm": 1.1306401491165161, + "learning_rate": 0.0002, + "loss": 1.1637, + "step": 7730 + }, + { + "epoch": 6.482412060301508, + "grad_norm": 1.2970236539840698, + "learning_rate": 0.0002, + "loss": 1.1534, + "step": 7740 + }, + { + "epoch": 6.490787269681742, + "grad_norm": 1.1515544652938843, + "learning_rate": 0.0002, + "loss": 1.1408, + "step": 7750 + }, + { + "epoch": 6.499162479061977, + "grad_norm": 1.13273024559021, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 7760 + }, + { + "epoch": 6.507537688442211, + "grad_norm": 1.1635724306106567, + "learning_rate": 0.0002, + "loss": 1.1356, + "step": 7770 + }, + { + "epoch": 6.515912897822446, + "grad_norm": 1.1620264053344727, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 7780 + }, + { + "epoch": 6.52428810720268, + "grad_norm": 1.159905195236206, + "learning_rate": 0.0002, + "loss": 1.1786, + "step": 7790 + }, + { + "epoch": 6.532663316582915, + "grad_norm": 1.2243341207504272, + "learning_rate": 0.0002, + "loss": 1.1252, + "step": 7800 + }, + { + "epoch": 6.541038525963149, + "grad_norm": 1.1034481525421143, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 7810 + }, + { + "epoch": 6.549413735343384, + "grad_norm": 1.1131408214569092, + "learning_rate": 0.0002, + "loss": 1.1579, + "step": 7820 + }, + { + "epoch": 6.557788944723618, + "grad_norm": 1.211260199546814, + "learning_rate": 0.0002, + "loss": 1.1053, + "step": 7830 + }, + { + "epoch": 6.566164154103853, + "grad_norm": 1.408692717552185, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 7840 + }, + { + "epoch": 6.574539363484087, + "grad_norm": 1.151441216468811, + "learning_rate": 0.0002, + "loss": 1.1586, + "step": 7850 + }, + { + "epoch": 6.582914572864322, + "grad_norm": 1.1160012483596802, + "learning_rate": 0.0002, + "loss": 1.1754, + "step": 7860 + }, + { + "epoch": 6.591289782244556, + "grad_norm": 1.2496052980422974, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 7870 + }, + { + "epoch": 6.599664991624791, + "grad_norm": 1.559907078742981, + "learning_rate": 0.0002, + "loss": 1.2007, + "step": 7880 + }, + { + "epoch": 6.608040201005025, + "grad_norm": 1.4399309158325195, + "learning_rate": 0.0002, + "loss": 1.1482, + "step": 7890 + }, + { + "epoch": 6.61641541038526, + "grad_norm": 1.155007243156433, + "learning_rate": 0.0002, + "loss": 1.1801, + "step": 7900 + }, + { + "epoch": 6.624790619765494, + "grad_norm": 1.4339076280593872, + "learning_rate": 0.0002, + "loss": 1.2029, + "step": 7910 + }, + { + "epoch": 6.633165829145729, + "grad_norm": 1.2093058824539185, + "learning_rate": 0.0002, + "loss": 1.1594, + "step": 7920 + }, + { + "epoch": 6.641541038525963, + "grad_norm": 1.1619434356689453, + "learning_rate": 0.0002, + "loss": 1.185, + "step": 7930 + }, + { + "epoch": 6.649916247906198, + "grad_norm": 1.2879594564437866, + "learning_rate": 0.0002, + "loss": 1.1369, + "step": 7940 + }, + { + "epoch": 6.658291457286432, + "grad_norm": 1.0598394870758057, + "learning_rate": 0.0002, + "loss": 1.1992, + "step": 7950 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0937503576278687, + "learning_rate": 0.0002, + "loss": 1.1337, + "step": 7960 + }, + { + "epoch": 6.675041876046901, + "grad_norm": 1.2670115232467651, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 7970 + }, + { + "epoch": 6.683417085427136, + "grad_norm": 1.2351782321929932, + "learning_rate": 0.0002, + "loss": 1.1711, + "step": 7980 + }, + { + "epoch": 6.69179229480737, + "grad_norm": 1.344128131866455, + "learning_rate": 0.0002, + "loss": 1.1774, + "step": 7990 + }, + { + "epoch": 6.700167504187605, + "grad_norm": 1.2894740104675293, + "learning_rate": 0.0002, + "loss": 1.1739, + "step": 8000 + }, + { + "epoch": 6.708542713567839, + "grad_norm": 1.1804684400558472, + "learning_rate": 0.0002, + "loss": 1.1045, + "step": 8010 + }, + { + "epoch": 6.716917922948074, + "grad_norm": 1.314237356185913, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 8020 + }, + { + "epoch": 6.725293132328308, + "grad_norm": 1.2132530212402344, + "learning_rate": 0.0002, + "loss": 1.1113, + "step": 8030 + }, + { + "epoch": 6.733668341708543, + "grad_norm": 0.999580979347229, + "learning_rate": 0.0002, + "loss": 1.1467, + "step": 8040 + }, + { + "epoch": 6.742043551088777, + "grad_norm": 1.206323266029358, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 8050 + }, + { + "epoch": 6.750418760469012, + "grad_norm": 1.1092344522476196, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 8060 + }, + { + "epoch": 6.758793969849246, + "grad_norm": 1.0168755054473877, + "learning_rate": 0.0002, + "loss": 1.1583, + "step": 8070 + }, + { + "epoch": 6.767169179229481, + "grad_norm": 1.2310614585876465, + "learning_rate": 0.0002, + "loss": 1.189, + "step": 8080 + }, + { + "epoch": 6.775544388609715, + "grad_norm": 1.1587172746658325, + "learning_rate": 0.0002, + "loss": 1.1775, + "step": 8090 + }, + { + "epoch": 6.78391959798995, + "grad_norm": 1.1362504959106445, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 8100 + }, + { + "epoch": 6.792294807370184, + "grad_norm": 1.3735119104385376, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 8110 + }, + { + "epoch": 6.800670016750419, + "grad_norm": 1.1804813146591187, + "learning_rate": 0.0002, + "loss": 1.1214, + "step": 8120 + }, + { + "epoch": 6.809045226130653, + "grad_norm": 1.1849592924118042, + "learning_rate": 0.0002, + "loss": 1.1035, + "step": 8130 + }, + { + "epoch": 6.817420435510888, + "grad_norm": 1.1638602018356323, + "learning_rate": 0.0002, + "loss": 1.1622, + "step": 8140 + }, + { + "epoch": 6.825795644891122, + "grad_norm": 1.2106250524520874, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 8150 + }, + { + "epoch": 6.834170854271357, + "grad_norm": 1.276068091392517, + "learning_rate": 0.0002, + "loss": 1.2231, + "step": 8160 + }, + { + "epoch": 6.842546063651591, + "grad_norm": 1.4283488988876343, + "learning_rate": 0.0002, + "loss": 1.1309, + "step": 8170 + }, + { + "epoch": 6.850921273031826, + "grad_norm": 1.4286448955535889, + "learning_rate": 0.0002, + "loss": 1.1494, + "step": 8180 + }, + { + "epoch": 6.85929648241206, + "grad_norm": 1.191275715827942, + "learning_rate": 0.0002, + "loss": 1.185, + "step": 8190 + }, + { + "epoch": 6.867671691792295, + "grad_norm": 1.4232908487319946, + "learning_rate": 0.0002, + "loss": 1.1984, + "step": 8200 + }, + { + "epoch": 6.876046901172529, + "grad_norm": 1.2166317701339722, + "learning_rate": 0.0002, + "loss": 1.182, + "step": 8210 + }, + { + "epoch": 6.884422110552764, + "grad_norm": 1.0487027168273926, + "learning_rate": 0.0002, + "loss": 1.1311, + "step": 8220 + }, + { + "epoch": 6.892797319932998, + "grad_norm": 1.247178077697754, + "learning_rate": 0.0002, + "loss": 1.1973, + "step": 8230 + }, + { + "epoch": 6.901172529313233, + "grad_norm": 1.0728635787963867, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 8240 + }, + { + "epoch": 6.909547738693467, + "grad_norm": 1.1909451484680176, + "learning_rate": 0.0002, + "loss": 1.2106, + "step": 8250 + }, + { + "epoch": 6.917922948073702, + "grad_norm": 1.337556004524231, + "learning_rate": 0.0002, + "loss": 1.1336, + "step": 8260 + }, + { + "epoch": 6.926298157453936, + "grad_norm": 1.1479394435882568, + "learning_rate": 0.0002, + "loss": 1.2295, + "step": 8270 + }, + { + "epoch": 6.934673366834171, + "grad_norm": 1.2038872241973877, + "learning_rate": 0.0002, + "loss": 1.1497, + "step": 8280 + }, + { + "epoch": 6.943048576214405, + "grad_norm": 1.088813066482544, + "learning_rate": 0.0002, + "loss": 1.1806, + "step": 8290 + }, + { + "epoch": 6.95142378559464, + "grad_norm": 1.0153290033340454, + "learning_rate": 0.0002, + "loss": 1.181, + "step": 8300 + }, + { + "epoch": 6.959798994974874, + "grad_norm": 1.2159703969955444, + "learning_rate": 0.0002, + "loss": 1.1846, + "step": 8310 + }, + { + "epoch": 6.968174204355109, + "grad_norm": 1.0844143629074097, + "learning_rate": 0.0002, + "loss": 1.1029, + "step": 8320 + }, + { + "epoch": 6.976549413735343, + "grad_norm": 1.1617385149002075, + "learning_rate": 0.0002, + "loss": 1.1843, + "step": 8330 + }, + { + "epoch": 6.984924623115578, + "grad_norm": 1.126503586769104, + "learning_rate": 0.0002, + "loss": 1.177, + "step": 8340 + }, + { + "epoch": 6.993299832495812, + "grad_norm": 1.1553548574447632, + "learning_rate": 0.0002, + "loss": 1.1753, + "step": 8350 + }, + { + "epoch": 7.0, + "eval_loss": 2.1463968753814697, + "eval_runtime": 37.9219, + "eval_samples_per_second": 13.581, + "eval_steps_per_second": 1.714, + "step": 8358 + }, + { + "epoch": 7.001675041876047, + "grad_norm": 1.0229777097702026, + "learning_rate": 0.0002, + "loss": 1.1205, + "step": 8360 + }, + { + "epoch": 7.010050251256281, + "grad_norm": 1.2346612215042114, + "learning_rate": 0.0002, + "loss": 0.9556, + "step": 8370 + }, + { + "epoch": 7.018425460636516, + "grad_norm": 1.2478288412094116, + "learning_rate": 0.0002, + "loss": 0.9406, + "step": 8380 + }, + { + "epoch": 7.02680067001675, + "grad_norm": 1.3081458806991577, + "learning_rate": 0.0002, + "loss": 0.9603, + "step": 8390 + }, + { + "epoch": 7.035175879396985, + "grad_norm": 1.508225440979004, + "learning_rate": 0.0002, + "loss": 0.9594, + "step": 8400 + }, + { + "epoch": 7.043551088777219, + "grad_norm": 1.7482528686523438, + "learning_rate": 0.0002, + "loss": 0.9472, + "step": 8410 + }, + { + "epoch": 7.051926298157454, + "grad_norm": 1.3465625047683716, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 8420 + }, + { + "epoch": 7.060301507537688, + "grad_norm": 1.3181530237197876, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 8430 + }, + { + "epoch": 7.068676716917923, + "grad_norm": 1.2666151523590088, + "learning_rate": 0.0002, + "loss": 0.9296, + "step": 8440 + }, + { + "epoch": 7.077051926298157, + "grad_norm": 1.5192651748657227, + "learning_rate": 0.0002, + "loss": 0.989, + "step": 8450 + }, + { + "epoch": 7.085427135678392, + "grad_norm": 1.3075478076934814, + "learning_rate": 0.0002, + "loss": 0.9281, + "step": 8460 + }, + { + "epoch": 7.093802345058626, + "grad_norm": 1.0856449604034424, + "learning_rate": 0.0002, + "loss": 0.9098, + "step": 8470 + }, + { + "epoch": 7.102177554438861, + "grad_norm": 1.299716830253601, + "learning_rate": 0.0002, + "loss": 0.9813, + "step": 8480 + }, + { + "epoch": 7.110552763819095, + "grad_norm": 1.4345086812973022, + "learning_rate": 0.0002, + "loss": 0.9572, + "step": 8490 + }, + { + "epoch": 7.11892797319933, + "grad_norm": 1.4502071142196655, + "learning_rate": 0.0002, + "loss": 0.9705, + "step": 8500 + }, + { + "epoch": 7.127303182579564, + "grad_norm": 1.315466284751892, + "learning_rate": 0.0002, + "loss": 0.9073, + "step": 8510 + }, + { + "epoch": 7.135678391959799, + "grad_norm": 1.2893296480178833, + "learning_rate": 0.0002, + "loss": 0.9635, + "step": 8520 + }, + { + "epoch": 7.144053601340033, + "grad_norm": 1.4431706666946411, + "learning_rate": 0.0002, + "loss": 0.9636, + "step": 8530 + }, + { + "epoch": 7.152428810720268, + "grad_norm": 1.2943761348724365, + "learning_rate": 0.0002, + "loss": 0.9761, + "step": 8540 + }, + { + "epoch": 7.160804020100502, + "grad_norm": 1.2941267490386963, + "learning_rate": 0.0002, + "loss": 1.0148, + "step": 8550 + }, + { + "epoch": 7.169179229480737, + "grad_norm": 1.1535996198654175, + "learning_rate": 0.0002, + "loss": 0.9336, + "step": 8560 + }, + { + "epoch": 7.177554438860971, + "grad_norm": 1.4487816095352173, + "learning_rate": 0.0002, + "loss": 0.9691, + "step": 8570 + }, + { + "epoch": 7.185929648241206, + "grad_norm": 1.2985659837722778, + "learning_rate": 0.0002, + "loss": 0.9904, + "step": 8580 + }, + { + "epoch": 7.19430485762144, + "grad_norm": 1.2589194774627686, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 8590 + }, + { + "epoch": 7.202680067001675, + "grad_norm": 1.327163815498352, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 8600 + }, + { + "epoch": 7.211055276381909, + "grad_norm": 1.2303123474121094, + "learning_rate": 0.0002, + "loss": 0.9809, + "step": 8610 + }, + { + "epoch": 7.219430485762144, + "grad_norm": 1.5056939125061035, + "learning_rate": 0.0002, + "loss": 0.967, + "step": 8620 + }, + { + "epoch": 7.227805695142378, + "grad_norm": 1.5022825002670288, + "learning_rate": 0.0002, + "loss": 0.987, + "step": 8630 + }, + { + "epoch": 7.236180904522613, + "grad_norm": 1.3092796802520752, + "learning_rate": 0.0002, + "loss": 1.0659, + "step": 8640 + }, + { + "epoch": 7.244556113902847, + "grad_norm": 1.2752959728240967, + "learning_rate": 0.0002, + "loss": 0.9434, + "step": 8650 + }, + { + "epoch": 7.252931323283082, + "grad_norm": 1.2906183004379272, + "learning_rate": 0.0002, + "loss": 0.9833, + "step": 8660 + }, + { + "epoch": 7.261306532663316, + "grad_norm": 1.6165488958358765, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 8670 + }, + { + "epoch": 7.269681742043551, + "grad_norm": 1.5356138944625854, + "learning_rate": 0.0002, + "loss": 1.0087, + "step": 8680 + }, + { + "epoch": 7.278056951423785, + "grad_norm": 1.4998574256896973, + "learning_rate": 0.0002, + "loss": 1.0101, + "step": 8690 + }, + { + "epoch": 7.28643216080402, + "grad_norm": 1.3943705558776855, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 8700 + }, + { + "epoch": 7.294807370184254, + "grad_norm": 1.2478622198104858, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 8710 + }, + { + "epoch": 7.303182579564489, + "grad_norm": 1.6093883514404297, + "learning_rate": 0.0002, + "loss": 0.9419, + "step": 8720 + }, + { + "epoch": 7.311557788944723, + "grad_norm": 1.2838177680969238, + "learning_rate": 0.0002, + "loss": 0.9502, + "step": 8730 + }, + { + "epoch": 7.319932998324958, + "grad_norm": 1.3537026643753052, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 8740 + }, + { + "epoch": 7.328308207705192, + "grad_norm": 1.5077383518218994, + "learning_rate": 0.0002, + "loss": 0.9632, + "step": 8750 + }, + { + "epoch": 7.336683417085427, + "grad_norm": 1.4921475648880005, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 8760 + }, + { + "epoch": 7.345058626465661, + "grad_norm": 1.3573282957077026, + "learning_rate": 0.0002, + "loss": 0.9919, + "step": 8770 + }, + { + "epoch": 7.353433835845896, + "grad_norm": 1.3224730491638184, + "learning_rate": 0.0002, + "loss": 1.0483, + "step": 8780 + }, + { + "epoch": 7.36180904522613, + "grad_norm": 1.3497579097747803, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 8790 + }, + { + "epoch": 7.370184254606365, + "grad_norm": 1.1072763204574585, + "learning_rate": 0.0002, + "loss": 0.9853, + "step": 8800 + }, + { + "epoch": 7.3785594639865995, + "grad_norm": 1.3373113870620728, + "learning_rate": 0.0002, + "loss": 1.0036, + "step": 8810 + }, + { + "epoch": 7.386934673366834, + "grad_norm": 1.2301900386810303, + "learning_rate": 0.0002, + "loss": 0.9636, + "step": 8820 + }, + { + "epoch": 7.3953098827470685, + "grad_norm": 1.4248781204223633, + "learning_rate": 0.0002, + "loss": 0.9903, + "step": 8830 + }, + { + "epoch": 7.403685092127303, + "grad_norm": 1.6177928447723389, + "learning_rate": 0.0002, + "loss": 0.9802, + "step": 8840 + }, + { + "epoch": 7.4120603015075375, + "grad_norm": 1.3096086978912354, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 8850 + }, + { + "epoch": 7.420435510887772, + "grad_norm": 1.5262911319732666, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 8860 + }, + { + "epoch": 7.4288107202680065, + "grad_norm": 1.7313627004623413, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 8870 + }, + { + "epoch": 7.437185929648241, + "grad_norm": 1.3323025703430176, + "learning_rate": 0.0002, + "loss": 0.9834, + "step": 8880 + }, + { + "epoch": 7.4455611390284755, + "grad_norm": 1.3253904581069946, + "learning_rate": 0.0002, + "loss": 1.0052, + "step": 8890 + }, + { + "epoch": 7.45393634840871, + "grad_norm": 1.3685275316238403, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 8900 + }, + { + "epoch": 7.4623115577889445, + "grad_norm": 1.4222962856292725, + "learning_rate": 0.0002, + "loss": 1.0126, + "step": 8910 + }, + { + "epoch": 7.4706867671691795, + "grad_norm": 1.429887056350708, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 8920 + }, + { + "epoch": 7.4790619765494135, + "grad_norm": 1.455110788345337, + "learning_rate": 0.0002, + "loss": 1.0003, + "step": 8930 + }, + { + "epoch": 7.4874371859296485, + "grad_norm": 1.298094630241394, + "learning_rate": 0.0002, + "loss": 1.0206, + "step": 8940 + }, + { + "epoch": 7.4958123953098825, + "grad_norm": 1.280696988105774, + "learning_rate": 0.0002, + "loss": 1.0263, + "step": 8950 + }, + { + "epoch": 7.5041876046901175, + "grad_norm": 1.2990771532058716, + "learning_rate": 0.0002, + "loss": 1.0196, + "step": 8960 + }, + { + "epoch": 7.5125628140703515, + "grad_norm": 1.5361275672912598, + "learning_rate": 0.0002, + "loss": 0.9732, + "step": 8970 + }, + { + "epoch": 7.5209380234505865, + "grad_norm": 1.2716164588928223, + "learning_rate": 0.0002, + "loss": 0.9778, + "step": 8980 + }, + { + "epoch": 7.5293132328308205, + "grad_norm": 1.5293556451797485, + "learning_rate": 0.0002, + "loss": 1.0031, + "step": 8990 + }, + { + "epoch": 7.5376884422110555, + "grad_norm": 1.5210952758789062, + "learning_rate": 0.0002, + "loss": 0.9817, + "step": 9000 + }, + { + "epoch": 7.54606365159129, + "grad_norm": 1.2735507488250732, + "learning_rate": 0.0002, + "loss": 0.9998, + "step": 9010 + }, + { + "epoch": 7.5544388609715245, + "grad_norm": 1.3383569717407227, + "learning_rate": 0.0002, + "loss": 1.001, + "step": 9020 + }, + { + "epoch": 7.562814070351759, + "grad_norm": 1.471486210823059, + "learning_rate": 0.0002, + "loss": 0.9423, + "step": 9030 + }, + { + "epoch": 7.5711892797319935, + "grad_norm": 1.4516266584396362, + "learning_rate": 0.0002, + "loss": 1.0043, + "step": 9040 + }, + { + "epoch": 7.579564489112228, + "grad_norm": 1.8539457321166992, + "learning_rate": 0.0002, + "loss": 1.0154, + "step": 9050 + }, + { + "epoch": 7.5879396984924625, + "grad_norm": 1.394018292427063, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 9060 + }, + { + "epoch": 7.596314907872697, + "grad_norm": 1.4161924123764038, + "learning_rate": 0.0002, + "loss": 1.0031, + "step": 9070 + }, + { + "epoch": 7.6046901172529315, + "grad_norm": 1.5264959335327148, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 9080 + }, + { + "epoch": 7.613065326633166, + "grad_norm": 1.3996148109436035, + "learning_rate": 0.0002, + "loss": 0.9758, + "step": 9090 + }, + { + "epoch": 7.6214405360134005, + "grad_norm": 1.485904574394226, + "learning_rate": 0.0002, + "loss": 1.027, + "step": 9100 + }, + { + "epoch": 7.629815745393635, + "grad_norm": 1.361729621887207, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 9110 + }, + { + "epoch": 7.63819095477387, + "grad_norm": 1.3930991888046265, + "learning_rate": 0.0002, + "loss": 1.0794, + "step": 9120 + }, + { + "epoch": 7.646566164154104, + "grad_norm": 1.3981443643569946, + "learning_rate": 0.0002, + "loss": 1.0524, + "step": 9130 + }, + { + "epoch": 7.654941373534339, + "grad_norm": 1.325538158416748, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 9140 + }, + { + "epoch": 7.663316582914573, + "grad_norm": 1.7479078769683838, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 9150 + }, + { + "epoch": 7.671691792294808, + "grad_norm": 1.6959037780761719, + "learning_rate": 0.0002, + "loss": 0.9984, + "step": 9160 + }, + { + "epoch": 7.680067001675042, + "grad_norm": 1.218790054321289, + "learning_rate": 0.0002, + "loss": 1.0603, + "step": 9170 + }, + { + "epoch": 7.688442211055277, + "grad_norm": 1.4050689935684204, + "learning_rate": 0.0002, + "loss": 1.0529, + "step": 9180 + }, + { + "epoch": 7.696817420435511, + "grad_norm": 1.361841082572937, + "learning_rate": 0.0002, + "loss": 0.9908, + "step": 9190 + }, + { + "epoch": 7.705192629815746, + "grad_norm": 1.1516344547271729, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 9200 + }, + { + "epoch": 7.71356783919598, + "grad_norm": 1.5105586051940918, + "learning_rate": 0.0002, + "loss": 1.0146, + "step": 9210 + }, + { + "epoch": 7.721943048576215, + "grad_norm": 1.4226511716842651, + "learning_rate": 0.0002, + "loss": 1.0912, + "step": 9220 + }, + { + "epoch": 7.730318257956449, + "grad_norm": 1.4334726333618164, + "learning_rate": 0.0002, + "loss": 1.0109, + "step": 9230 + }, + { + "epoch": 7.738693467336684, + "grad_norm": 1.144550085067749, + "learning_rate": 0.0002, + "loss": 0.9502, + "step": 9240 + }, + { + "epoch": 7.747068676716918, + "grad_norm": 1.292710781097412, + "learning_rate": 0.0002, + "loss": 0.9771, + "step": 9250 + }, + { + "epoch": 7.755443886097153, + "grad_norm": 1.3884655237197876, + "learning_rate": 0.0002, + "loss": 1.0247, + "step": 9260 + }, + { + "epoch": 7.763819095477387, + "grad_norm": 1.5045685768127441, + "learning_rate": 0.0002, + "loss": 1.0467, + "step": 9270 + }, + { + "epoch": 7.772194304857622, + "grad_norm": 1.3433866500854492, + "learning_rate": 0.0002, + "loss": 1.0393, + "step": 9280 + }, + { + "epoch": 7.780569514237856, + "grad_norm": 1.4879025220870972, + "learning_rate": 0.0002, + "loss": 1.0634, + "step": 9290 + }, + { + "epoch": 7.788944723618091, + "grad_norm": 1.3347378969192505, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 9300 + }, + { + "epoch": 7.797319932998325, + "grad_norm": 1.3727476596832275, + "learning_rate": 0.0002, + "loss": 1.017, + "step": 9310 + }, + { + "epoch": 7.80569514237856, + "grad_norm": 1.4126251935958862, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 9320 + }, + { + "epoch": 7.814070351758794, + "grad_norm": 1.2898106575012207, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 9330 + }, + { + "epoch": 7.822445561139029, + "grad_norm": 1.2732993364334106, + "learning_rate": 0.0002, + "loss": 1.0605, + "step": 9340 + }, + { + "epoch": 7.830820770519263, + "grad_norm": 1.1767915487289429, + "learning_rate": 0.0002, + "loss": 0.9851, + "step": 9350 + }, + { + "epoch": 7.839195979899498, + "grad_norm": 1.308590054512024, + "learning_rate": 0.0002, + "loss": 1.0196, + "step": 9360 + }, + { + "epoch": 7.847571189279732, + "grad_norm": 1.3947384357452393, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 9370 + }, + { + "epoch": 7.855946398659967, + "grad_norm": 1.3855421543121338, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 9380 + }, + { + "epoch": 7.864321608040201, + "grad_norm": 1.5742900371551514, + "learning_rate": 0.0002, + "loss": 1.0267, + "step": 9390 + }, + { + "epoch": 7.872696817420436, + "grad_norm": 1.4731863737106323, + "learning_rate": 0.0002, + "loss": 0.988, + "step": 9400 + }, + { + "epoch": 7.88107202680067, + "grad_norm": 1.5974364280700684, + "learning_rate": 0.0002, + "loss": 1.0387, + "step": 9410 + }, + { + "epoch": 7.889447236180905, + "grad_norm": 1.574455738067627, + "learning_rate": 0.0002, + "loss": 1.105, + "step": 9420 + }, + { + "epoch": 7.897822445561139, + "grad_norm": 1.3285928964614868, + "learning_rate": 0.0002, + "loss": 1.0818, + "step": 9430 + }, + { + "epoch": 7.906197654941374, + "grad_norm": 1.2003569602966309, + "learning_rate": 0.0002, + "loss": 1.0631, + "step": 9440 + }, + { + "epoch": 7.914572864321608, + "grad_norm": 1.2798550128936768, + "learning_rate": 0.0002, + "loss": 1.0911, + "step": 9450 + }, + { + "epoch": 7.922948073701843, + "grad_norm": 1.533443570137024, + "learning_rate": 0.0002, + "loss": 1.0306, + "step": 9460 + }, + { + "epoch": 7.931323283082077, + "grad_norm": 1.525195837020874, + "learning_rate": 0.0002, + "loss": 1.0484, + "step": 9470 + }, + { + "epoch": 7.939698492462312, + "grad_norm": 1.3638207912445068, + "learning_rate": 0.0002, + "loss": 1.0372, + "step": 9480 + }, + { + "epoch": 7.948073701842546, + "grad_norm": 1.4047036170959473, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 9490 + }, + { + "epoch": 7.956448911222781, + "grad_norm": 1.2534632682800293, + "learning_rate": 0.0002, + "loss": 1.0279, + "step": 9500 + }, + { + "epoch": 7.964824120603015, + "grad_norm": 1.4334971904754639, + "learning_rate": 0.0002, + "loss": 1.0109, + "step": 9510 + }, + { + "epoch": 7.97319932998325, + "grad_norm": 1.2948139905929565, + "learning_rate": 0.0002, + "loss": 1.0511, + "step": 9520 + }, + { + "epoch": 7.981574539363484, + "grad_norm": 1.3664277791976929, + "learning_rate": 0.0002, + "loss": 1.0856, + "step": 9530 + }, + { + "epoch": 7.989949748743719, + "grad_norm": 1.3293516635894775, + "learning_rate": 0.0002, + "loss": 1.0131, + "step": 9540 + }, + { + "epoch": 7.998324958123953, + "grad_norm": 1.5311461687088013, + "learning_rate": 0.0002, + "loss": 1.0864, + "step": 9550 + }, + { + "epoch": 8.0, + "eval_loss": 2.243375062942505, + "eval_runtime": 37.9136, + "eval_samples_per_second": 13.584, + "eval_steps_per_second": 1.714, + "step": 9552 + } + ], + "logging_steps": 10, + "max_steps": 9552, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.420451722057482e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-9552/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2176b2f082298306ecd4ddec265daba8d40b837f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db03202ff3d5e1dce5980463ad4d40fa9407d7d3624ffbc2fca0ad163b9f3c47 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f04b5a40e34f337981007975042f1a7d9ac08682 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 1194, "epoch_duration": 1316.3576848506927, "total_accumulated_duration": 1316.3576848506927, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}]} +{"epoch": 2.0, "step": 2388, "epoch_duration": 1324.833295583725, "total_accumulated_duration": 2641.1909804344177, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-1194", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}]} +{"epoch": 3.0, "step": 3582, "epoch_duration": 1321.083755493164, "total_accumulated_duration": 3962.274735927582, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}]} +{"epoch": 4.0, "step": 4776, "epoch_duration": 1336.55681681633, "total_accumulated_duration": 5298.831552743912, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}, {"eval_loss": 1.8283122777938843, "eval_runtime": 38.023, "eval_samples_per_second": 13.544, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 3582}, {"loss": 1.5874, "grad_norm": 0.6784713268280029, "learning_rate": 0.0002, "epoch": 3.006700167504188, "step": 3590}, {"loss": 1.5813, "grad_norm": 0.5783940553665161, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 3600}, {"loss": 1.4769, "grad_norm": 0.5408937335014343, "learning_rate": 0.0002, "epoch": 3.023450586264657, "step": 3610}, {"loss": 1.526, "grad_norm": 0.5229013562202454, "learning_rate": 0.0002, "epoch": 3.0318257956448913, "step": 3620}, {"loss": 1.4835, "grad_norm": 0.49160143733024597, "learning_rate": 0.0002, "epoch": 3.040201005025126, "step": 3630}, {"loss": 1.5398, "grad_norm": 0.6563201546669006, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 3640}, {"loss": 1.448, "grad_norm": 0.5686020851135254, "learning_rate": 0.0002, "epoch": 3.056951423785595, "step": 3650}, {"loss": 1.4541, "grad_norm": 0.5774043202400208, "learning_rate": 0.0002, "epoch": 3.0653266331658293, "step": 3660}, {"loss": 1.4734, "grad_norm": 0.6106171011924744, "learning_rate": 0.0002, "epoch": 3.073701842546064, "step": 3670}, {"loss": 1.4961, "grad_norm": 0.517433226108551, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 3680}, {"loss": 1.4961, "grad_norm": 0.5681702494621277, "learning_rate": 0.0002, "epoch": 3.090452261306533, "step": 3690}, {"loss": 1.4731, "grad_norm": 0.5769233107566833, "learning_rate": 0.0002, "epoch": 3.0988274706867673, "step": 3700}, {"loss": 1.4836, "grad_norm": 0.5657462477684021, "learning_rate": 0.0002, "epoch": 3.107202680067002, "step": 3710}, {"loss": 1.4526, "grad_norm": 0.6035246253013611, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 3720}, {"loss": 1.5102, "grad_norm": 0.7286643385887146, "learning_rate": 0.0002, "epoch": 3.123953098827471, "step": 3730}, {"loss": 1.4444, "grad_norm": 0.5121201872825623, "learning_rate": 0.0002, "epoch": 3.1323283082077054, "step": 3740}, {"loss": 1.565, "grad_norm": 0.5074213147163391, "learning_rate": 0.0002, "epoch": 3.14070351758794, "step": 3750}, {"loss": 1.4729, "grad_norm": 0.57481849193573, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 3760}, {"loss": 1.4765, "grad_norm": 0.6326663494110107, "learning_rate": 0.0002, "epoch": 3.157453936348409, "step": 3770}, {"loss": 1.4888, "grad_norm": 0.6039315462112427, "learning_rate": 0.0002, "epoch": 3.1658291457286434, "step": 3780}, {"loss": 1.5084, "grad_norm": 0.6936715245246887, "learning_rate": 0.0002, "epoch": 3.174204355108878, "step": 3790}, {"loss": 1.4879, "grad_norm": 0.6516796946525574, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 3800}, {"loss": 1.578, "grad_norm": 0.6140730977058411, "learning_rate": 0.0002, "epoch": 3.190954773869347, "step": 3810}, {"loss": 1.5101, "grad_norm": 0.631328284740448, "learning_rate": 0.0002, "epoch": 3.1993299832495814, "step": 3820}, {"loss": 1.4844, "grad_norm": 0.6265402436256409, "learning_rate": 0.0002, "epoch": 3.207705192629816, "step": 3830}, {"loss": 1.5332, "grad_norm": 0.6649428606033325, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 3840}, {"loss": 1.5231, "grad_norm": 0.5329259634017944, "learning_rate": 0.0002, "epoch": 3.224455611390285, "step": 3850}, {"loss": 1.5714, "grad_norm": 0.6008304953575134, "learning_rate": 0.0002, "epoch": 3.2328308207705194, "step": 3860}, {"loss": 1.5214, "grad_norm": 0.5918582081794739, "learning_rate": 0.0002, "epoch": 3.241206030150754, "step": 3870}, {"loss": 1.571, "grad_norm": 0.643622100353241, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 3880}, {"loss": 1.5274, "grad_norm": 0.5517964363098145, "learning_rate": 0.0002, "epoch": 3.257956448911223, "step": 3890}, {"loss": 1.5458, "grad_norm": 0.6780755519866943, "learning_rate": 0.0002, "epoch": 3.2663316582914574, "step": 3900}, {"loss": 1.5743, "grad_norm": 0.6742202639579773, "learning_rate": 0.0002, "epoch": 3.274706867671692, "step": 3910}, {"loss": 1.5279, "grad_norm": 0.6228749752044678, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 3920}, {"loss": 1.4899, "grad_norm": 0.5836303234100342, "learning_rate": 0.0002, "epoch": 3.291457286432161, "step": 3930}, {"loss": 1.5445, "grad_norm": 0.6337724328041077, "learning_rate": 0.0002, "epoch": 3.2998324958123955, "step": 3940}, {"loss": 1.5618, "grad_norm": 0.6345084309577942, "learning_rate": 0.0002, "epoch": 3.30820770519263, "step": 3950}, {"loss": 1.4224, "grad_norm": 0.6125303506851196, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 3960}, {"loss": 1.5355, "grad_norm": 0.6259911060333252, "learning_rate": 0.0002, "epoch": 3.324958123953099, "step": 3970}, {"loss": 1.5427, "grad_norm": 0.645745575428009, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 3980}, {"loss": 1.5817, "grad_norm": 0.6666176915168762, "learning_rate": 0.0002, "epoch": 3.341708542713568, "step": 3990}, {"loss": 1.4998, "grad_norm": 0.59013831615448, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 4000}, {"loss": 1.4921, "grad_norm": 0.6604634523391724, "learning_rate": 0.0002, "epoch": 3.358458961474037, "step": 4010}, {"loss": 1.5076, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 3.3668341708542715, "step": 4020}, {"loss": 1.4801, "grad_norm": 0.515724778175354, "learning_rate": 0.0002, "epoch": 3.375209380234506, "step": 4030}, {"loss": 1.4932, "grad_norm": 0.681968092918396, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 4040}, {"loss": 1.5148, "grad_norm": 0.5978158116340637, "learning_rate": 0.0002, "epoch": 3.391959798994975, "step": 4050}, {"loss": 1.5449, "grad_norm": 0.6043432354927063, "learning_rate": 0.0002, "epoch": 3.4003350083752095, "step": 4060}, {"loss": 1.5021, "grad_norm": 0.5899770855903625, "learning_rate": 0.0002, "epoch": 3.408710217755444, "step": 4070}, {"loss": 1.5992, "grad_norm": 0.6014242172241211, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 4080}, {"loss": 1.4692, "grad_norm": 0.5944811105728149, "learning_rate": 0.0002, "epoch": 3.425460636515913, "step": 4090}, {"loss": 1.5877, "grad_norm": 0.6506822109222412, "learning_rate": 0.0002, "epoch": 3.4338358458961475, "step": 4100}, {"loss": 1.5144, "grad_norm": 0.6926528811454773, "learning_rate": 0.0002, "epoch": 3.442211055276382, "step": 4110}, {"loss": 1.5169, "grad_norm": 0.5646378993988037, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 4120}, {"loss": 1.5032, "grad_norm": 0.7233654856681824, "learning_rate": 0.0002, "epoch": 3.458961474036851, "step": 4130}, {"loss": 1.5161, "grad_norm": 0.6231815814971924, "learning_rate": 0.0002, "epoch": 3.4673366834170856, "step": 4140}, {"loss": 1.5349, "grad_norm": 0.6115689873695374, "learning_rate": 0.0002, "epoch": 3.47571189279732, "step": 4150}, {"loss": 1.4621, "grad_norm": 0.5812674760818481, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 4160}, {"loss": 1.5465, "grad_norm": 0.6099632978439331, "learning_rate": 0.0002, "epoch": 3.492462311557789, "step": 4170}, {"loss": 1.4795, "grad_norm": 0.6102647185325623, "learning_rate": 0.0002, "epoch": 3.5008375209380236, "step": 4180}, {"loss": 1.5305, "grad_norm": 0.6034680008888245, "learning_rate": 0.0002, "epoch": 3.509212730318258, "step": 4190}, {"loss": 1.5093, "grad_norm": 0.6281666159629822, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 4200}, {"loss": 1.4903, "grad_norm": 0.6245372295379639, "learning_rate": 0.0002, "epoch": 3.525963149078727, "step": 4210}, {"loss": 1.5098, "grad_norm": 0.5897293090820312, "learning_rate": 0.0002, "epoch": 3.5343383584589616, "step": 4220}, {"loss": 1.5991, "grad_norm": 0.601054847240448, "learning_rate": 0.0002, "epoch": 3.542713567839196, "step": 4230}, {"loss": 1.4974, "grad_norm": 0.7004473805427551, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 4240}, {"loss": 1.5993, "grad_norm": 0.6601553559303284, "learning_rate": 0.0002, "epoch": 3.559463986599665, "step": 4250}, {"loss": 1.4961, "grad_norm": 0.6112467050552368, "learning_rate": 0.0002, "epoch": 3.5678391959798996, "step": 4260}, {"loss": 1.4967, "grad_norm": 0.5902454853057861, "learning_rate": 0.0002, "epoch": 3.576214405360134, "step": 4270}, {"loss": 1.5659, "grad_norm": 0.5792450904846191, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 4280}, {"loss": 1.4664, "grad_norm": 0.5923888087272644, "learning_rate": 0.0002, "epoch": 3.592964824120603, "step": 4290}, {"loss": 1.5155, "grad_norm": 0.5869482159614563, "learning_rate": 0.0002, "epoch": 3.6013400335008376, "step": 4300}, {"loss": 1.5119, "grad_norm": 0.6372929811477661, "learning_rate": 0.0002, "epoch": 3.609715242881072, "step": 4310}, {"loss": 1.4977, "grad_norm": 0.6350686550140381, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 4320}, {"loss": 1.5226, "grad_norm": 0.571819007396698, "learning_rate": 0.0002, "epoch": 3.626465661641541, "step": 4330}, {"loss": 1.5414, "grad_norm": 0.592250645160675, "learning_rate": 0.0002, "epoch": 3.6348408710217757, "step": 4340}, {"loss": 1.4912, "grad_norm": 0.6110650897026062, "learning_rate": 0.0002, "epoch": 3.64321608040201, "step": 4350}, {"loss": 1.6089, "grad_norm": 0.6187081336975098, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 4360}, {"loss": 1.5345, "grad_norm": 0.6197671890258789, "learning_rate": 0.0002, "epoch": 3.659966499162479, "step": 4370}, {"loss": 1.4988, "grad_norm": 0.6050862669944763, "learning_rate": 0.0002, "epoch": 3.6683417085427137, "step": 4380}, {"loss": 1.4872, "grad_norm": 0.621265172958374, "learning_rate": 0.0002, "epoch": 3.676716917922948, "step": 4390}, {"loss": 1.6011, "grad_norm": 0.6552940011024475, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 4400}, {"loss": 1.4344, "grad_norm": 0.5638861060142517, "learning_rate": 0.0002, "epoch": 3.693467336683417, "step": 4410}, {"loss": 1.4985, "grad_norm": 0.6388863325119019, "learning_rate": 0.0002, "epoch": 3.7018425460636517, "step": 4420}, {"loss": 1.3696, "grad_norm": 0.6062559485435486, "learning_rate": 0.0002, "epoch": 3.710217755443886, "step": 4430}, {"loss": 1.5101, "grad_norm": 0.5800350308418274, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 4440}, {"loss": 1.5286, "grad_norm": 0.5954474210739136, "learning_rate": 0.0002, "epoch": 3.726968174204355, "step": 4450}, {"loss": 1.6133, "grad_norm": 0.5880125761032104, "learning_rate": 0.0002, "epoch": 3.7353433835845897, "step": 4460}, {"loss": 1.5055, "grad_norm": 0.5880921483039856, "learning_rate": 0.0002, "epoch": 3.7437185929648242, "step": 4470}, {"loss": 1.5728, "grad_norm": 0.5995073914527893, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 4480}, {"loss": 1.554, "grad_norm": 0.5958493947982788, "learning_rate": 0.0002, "epoch": 3.7604690117252932, "step": 4490}, {"loss": 1.5472, "grad_norm": 0.5694711804389954, "learning_rate": 0.0002, "epoch": 3.7688442211055277, "step": 4500}, {"loss": 1.5105, "grad_norm": 0.6175141930580139, "learning_rate": 0.0002, "epoch": 3.7772194304857623, "step": 4510}, {"loss": 1.5404, "grad_norm": 0.5541581511497498, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 4520}, {"loss": 1.5283, "grad_norm": 0.5986164808273315, "learning_rate": 0.0002, "epoch": 3.7939698492462313, "step": 4530}, {"loss": 1.4961, "grad_norm": 0.640072226524353, "learning_rate": 0.0002, "epoch": 3.8023450586264658, "step": 4540}, {"loss": 1.5297, "grad_norm": 0.5742579698562622, "learning_rate": 0.0002, "epoch": 3.8107202680067003, "step": 4550}, {"loss": 1.5591, "grad_norm": 0.6658656001091003, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 4560}, {"loss": 1.4992, "grad_norm": 0.5475369691848755, "learning_rate": 0.0002, "epoch": 3.8274706867671693, "step": 4570}, {"loss": 1.5966, "grad_norm": 0.613172173500061, "learning_rate": 0.0002, "epoch": 3.835845896147404, "step": 4580}, {"loss": 1.5594, "grad_norm": 0.590968132019043, "learning_rate": 0.0002, "epoch": 3.8442211055276383, "step": 4590}, {"loss": 1.5067, "grad_norm": 0.5865461826324463, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 4600}, {"loss": 1.5247, "grad_norm": 0.6815178990364075, "learning_rate": 0.0002, "epoch": 3.8609715242881073, "step": 4610}, {"loss": 1.5702, "grad_norm": 0.6551400423049927, "learning_rate": 0.0002, "epoch": 3.869346733668342, "step": 4620}, {"loss": 1.4891, "grad_norm": 0.6398897171020508, "learning_rate": 0.0002, "epoch": 3.8777219430485763, "step": 4630}, {"loss": 1.5353, "grad_norm": 0.6761762499809265, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 4640}, {"loss": 1.6071, "grad_norm": 0.6277294754981995, "learning_rate": 0.0002, "epoch": 3.8944723618090453, "step": 4650}, {"loss": 1.5605, "grad_norm": 0.6285301446914673, "learning_rate": 0.0002, "epoch": 3.90284757118928, "step": 4660}, {"loss": 1.5937, "grad_norm": 0.5416069626808167, "learning_rate": 0.0002, "epoch": 3.9112227805695143, "step": 4670}, {"loss": 1.5461, "grad_norm": 0.6314545273780823, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 4680}, {"loss": 1.4828, "grad_norm": 0.604479968547821, "learning_rate": 0.0002, "epoch": 3.9279731993299833, "step": 4690}, {"loss": 1.5186, "grad_norm": 0.5321660041809082, "learning_rate": 0.0002, "epoch": 3.936348408710218, "step": 4700}, {"loss": 1.4696, "grad_norm": 0.6632516980171204, "learning_rate": 0.0002, "epoch": 3.9447236180904524, "step": 4710}, {"loss": 1.519, "grad_norm": 0.5925896763801575, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 4720}, {"loss": 1.5716, "grad_norm": 0.6580308675765991, "learning_rate": 0.0002, "epoch": 3.9614740368509214, "step": 4730}, {"loss": 1.4462, "grad_norm": 0.5578170418739319, "learning_rate": 0.0002, "epoch": 3.969849246231156, "step": 4740}, {"loss": 1.5394, "grad_norm": 0.6216608285903931, "learning_rate": 0.0002, "epoch": 3.9782244556113904, "step": 4750}, {"loss": 1.5395, "grad_norm": 0.5693069696426392, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 4760}, {"loss": 1.5517, "grad_norm": 0.5353434681892395, "learning_rate": 0.0002, "epoch": 3.9949748743718594, "step": 4770}]} +{"epoch": 5.0, "step": 5970, "epoch_duration": 1332.369313955307, "total_accumulated_duration": 6631.200866699219, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}, {"eval_loss": 1.8283122777938843, "eval_runtime": 38.023, "eval_samples_per_second": 13.544, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 3582}, {"loss": 1.5874, "grad_norm": 0.6784713268280029, "learning_rate": 0.0002, "epoch": 3.006700167504188, "step": 3590}, {"loss": 1.5813, "grad_norm": 0.5783940553665161, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 3600}, {"loss": 1.4769, "grad_norm": 0.5408937335014343, "learning_rate": 0.0002, "epoch": 3.023450586264657, "step": 3610}, {"loss": 1.526, "grad_norm": 0.5229013562202454, "learning_rate": 0.0002, "epoch": 3.0318257956448913, "step": 3620}, {"loss": 1.4835, "grad_norm": 0.49160143733024597, "learning_rate": 0.0002, "epoch": 3.040201005025126, "step": 3630}, {"loss": 1.5398, "grad_norm": 0.6563201546669006, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 3640}, {"loss": 1.448, "grad_norm": 0.5686020851135254, "learning_rate": 0.0002, "epoch": 3.056951423785595, "step": 3650}, {"loss": 1.4541, "grad_norm": 0.5774043202400208, "learning_rate": 0.0002, "epoch": 3.0653266331658293, "step": 3660}, {"loss": 1.4734, "grad_norm": 0.6106171011924744, "learning_rate": 0.0002, "epoch": 3.073701842546064, "step": 3670}, {"loss": 1.4961, "grad_norm": 0.517433226108551, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 3680}, {"loss": 1.4961, "grad_norm": 0.5681702494621277, "learning_rate": 0.0002, "epoch": 3.090452261306533, "step": 3690}, {"loss": 1.4731, "grad_norm": 0.5769233107566833, "learning_rate": 0.0002, "epoch": 3.0988274706867673, "step": 3700}, {"loss": 1.4836, "grad_norm": 0.5657462477684021, "learning_rate": 0.0002, "epoch": 3.107202680067002, "step": 3710}, {"loss": 1.4526, "grad_norm": 0.6035246253013611, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 3720}, {"loss": 1.5102, "grad_norm": 0.7286643385887146, "learning_rate": 0.0002, "epoch": 3.123953098827471, "step": 3730}, {"loss": 1.4444, "grad_norm": 0.5121201872825623, "learning_rate": 0.0002, "epoch": 3.1323283082077054, "step": 3740}, {"loss": 1.565, "grad_norm": 0.5074213147163391, "learning_rate": 0.0002, "epoch": 3.14070351758794, "step": 3750}, {"loss": 1.4729, "grad_norm": 0.57481849193573, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 3760}, {"loss": 1.4765, "grad_norm": 0.6326663494110107, "learning_rate": 0.0002, "epoch": 3.157453936348409, "step": 3770}, {"loss": 1.4888, "grad_norm": 0.6039315462112427, "learning_rate": 0.0002, "epoch": 3.1658291457286434, "step": 3780}, {"loss": 1.5084, "grad_norm": 0.6936715245246887, "learning_rate": 0.0002, "epoch": 3.174204355108878, "step": 3790}, {"loss": 1.4879, "grad_norm": 0.6516796946525574, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 3800}, {"loss": 1.578, "grad_norm": 0.6140730977058411, "learning_rate": 0.0002, "epoch": 3.190954773869347, "step": 3810}, {"loss": 1.5101, "grad_norm": 0.631328284740448, "learning_rate": 0.0002, "epoch": 3.1993299832495814, "step": 3820}, {"loss": 1.4844, "grad_norm": 0.6265402436256409, "learning_rate": 0.0002, "epoch": 3.207705192629816, "step": 3830}, {"loss": 1.5332, "grad_norm": 0.6649428606033325, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 3840}, {"loss": 1.5231, "grad_norm": 0.5329259634017944, "learning_rate": 0.0002, "epoch": 3.224455611390285, "step": 3850}, {"loss": 1.5714, "grad_norm": 0.6008304953575134, "learning_rate": 0.0002, "epoch": 3.2328308207705194, "step": 3860}, {"loss": 1.5214, "grad_norm": 0.5918582081794739, "learning_rate": 0.0002, "epoch": 3.241206030150754, "step": 3870}, {"loss": 1.571, "grad_norm": 0.643622100353241, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 3880}, {"loss": 1.5274, "grad_norm": 0.5517964363098145, "learning_rate": 0.0002, "epoch": 3.257956448911223, "step": 3890}, {"loss": 1.5458, "grad_norm": 0.6780755519866943, "learning_rate": 0.0002, "epoch": 3.2663316582914574, "step": 3900}, {"loss": 1.5743, "grad_norm": 0.6742202639579773, "learning_rate": 0.0002, "epoch": 3.274706867671692, "step": 3910}, {"loss": 1.5279, "grad_norm": 0.6228749752044678, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 3920}, {"loss": 1.4899, "grad_norm": 0.5836303234100342, "learning_rate": 0.0002, "epoch": 3.291457286432161, "step": 3930}, {"loss": 1.5445, "grad_norm": 0.6337724328041077, "learning_rate": 0.0002, "epoch": 3.2998324958123955, "step": 3940}, {"loss": 1.5618, "grad_norm": 0.6345084309577942, "learning_rate": 0.0002, "epoch": 3.30820770519263, "step": 3950}, {"loss": 1.4224, "grad_norm": 0.6125303506851196, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 3960}, {"loss": 1.5355, "grad_norm": 0.6259911060333252, "learning_rate": 0.0002, "epoch": 3.324958123953099, "step": 3970}, {"loss": 1.5427, "grad_norm": 0.645745575428009, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 3980}, {"loss": 1.5817, "grad_norm": 0.6666176915168762, "learning_rate": 0.0002, "epoch": 3.341708542713568, "step": 3990}, {"loss": 1.4998, "grad_norm": 0.59013831615448, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 4000}, {"loss": 1.4921, "grad_norm": 0.6604634523391724, "learning_rate": 0.0002, "epoch": 3.358458961474037, "step": 4010}, {"loss": 1.5076, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 3.3668341708542715, "step": 4020}, {"loss": 1.4801, "grad_norm": 0.515724778175354, "learning_rate": 0.0002, "epoch": 3.375209380234506, "step": 4030}, {"loss": 1.4932, "grad_norm": 0.681968092918396, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 4040}, {"loss": 1.5148, "grad_norm": 0.5978158116340637, "learning_rate": 0.0002, "epoch": 3.391959798994975, "step": 4050}, {"loss": 1.5449, "grad_norm": 0.6043432354927063, "learning_rate": 0.0002, "epoch": 3.4003350083752095, "step": 4060}, {"loss": 1.5021, "grad_norm": 0.5899770855903625, "learning_rate": 0.0002, "epoch": 3.408710217755444, "step": 4070}, {"loss": 1.5992, "grad_norm": 0.6014242172241211, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 4080}, {"loss": 1.4692, "grad_norm": 0.5944811105728149, "learning_rate": 0.0002, "epoch": 3.425460636515913, "step": 4090}, {"loss": 1.5877, "grad_norm": 0.6506822109222412, "learning_rate": 0.0002, "epoch": 3.4338358458961475, "step": 4100}, {"loss": 1.5144, "grad_norm": 0.6926528811454773, "learning_rate": 0.0002, "epoch": 3.442211055276382, "step": 4110}, {"loss": 1.5169, "grad_norm": 0.5646378993988037, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 4120}, {"loss": 1.5032, "grad_norm": 0.7233654856681824, "learning_rate": 0.0002, "epoch": 3.458961474036851, "step": 4130}, {"loss": 1.5161, "grad_norm": 0.6231815814971924, "learning_rate": 0.0002, "epoch": 3.4673366834170856, "step": 4140}, {"loss": 1.5349, "grad_norm": 0.6115689873695374, "learning_rate": 0.0002, "epoch": 3.47571189279732, "step": 4150}, {"loss": 1.4621, "grad_norm": 0.5812674760818481, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 4160}, {"loss": 1.5465, "grad_norm": 0.6099632978439331, "learning_rate": 0.0002, "epoch": 3.492462311557789, "step": 4170}, {"loss": 1.4795, "grad_norm": 0.6102647185325623, "learning_rate": 0.0002, "epoch": 3.5008375209380236, "step": 4180}, {"loss": 1.5305, "grad_norm": 0.6034680008888245, "learning_rate": 0.0002, "epoch": 3.509212730318258, "step": 4190}, {"loss": 1.5093, "grad_norm": 0.6281666159629822, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 4200}, {"loss": 1.4903, "grad_norm": 0.6245372295379639, "learning_rate": 0.0002, "epoch": 3.525963149078727, "step": 4210}, {"loss": 1.5098, "grad_norm": 0.5897293090820312, "learning_rate": 0.0002, "epoch": 3.5343383584589616, "step": 4220}, {"loss": 1.5991, "grad_norm": 0.601054847240448, "learning_rate": 0.0002, "epoch": 3.542713567839196, "step": 4230}, {"loss": 1.4974, "grad_norm": 0.7004473805427551, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 4240}, {"loss": 1.5993, "grad_norm": 0.6601553559303284, "learning_rate": 0.0002, "epoch": 3.559463986599665, "step": 4250}, {"loss": 1.4961, "grad_norm": 0.6112467050552368, "learning_rate": 0.0002, "epoch": 3.5678391959798996, "step": 4260}, {"loss": 1.4967, "grad_norm": 0.5902454853057861, "learning_rate": 0.0002, "epoch": 3.576214405360134, "step": 4270}, {"loss": 1.5659, "grad_norm": 0.5792450904846191, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 4280}, {"loss": 1.4664, "grad_norm": 0.5923888087272644, "learning_rate": 0.0002, "epoch": 3.592964824120603, "step": 4290}, {"loss": 1.5155, "grad_norm": 0.5869482159614563, "learning_rate": 0.0002, "epoch": 3.6013400335008376, "step": 4300}, {"loss": 1.5119, "grad_norm": 0.6372929811477661, "learning_rate": 0.0002, "epoch": 3.609715242881072, "step": 4310}, {"loss": 1.4977, "grad_norm": 0.6350686550140381, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 4320}, {"loss": 1.5226, "grad_norm": 0.571819007396698, "learning_rate": 0.0002, "epoch": 3.626465661641541, "step": 4330}, {"loss": 1.5414, "grad_norm": 0.592250645160675, "learning_rate": 0.0002, "epoch": 3.6348408710217757, "step": 4340}, {"loss": 1.4912, "grad_norm": 0.6110650897026062, "learning_rate": 0.0002, "epoch": 3.64321608040201, "step": 4350}, {"loss": 1.6089, "grad_norm": 0.6187081336975098, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 4360}, {"loss": 1.5345, "grad_norm": 0.6197671890258789, "learning_rate": 0.0002, "epoch": 3.659966499162479, "step": 4370}, {"loss": 1.4988, "grad_norm": 0.6050862669944763, "learning_rate": 0.0002, "epoch": 3.6683417085427137, "step": 4380}, {"loss": 1.4872, "grad_norm": 0.621265172958374, "learning_rate": 0.0002, "epoch": 3.676716917922948, "step": 4390}, {"loss": 1.6011, "grad_norm": 0.6552940011024475, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 4400}, {"loss": 1.4344, "grad_norm": 0.5638861060142517, "learning_rate": 0.0002, "epoch": 3.693467336683417, "step": 4410}, {"loss": 1.4985, "grad_norm": 0.6388863325119019, "learning_rate": 0.0002, "epoch": 3.7018425460636517, "step": 4420}, {"loss": 1.3696, "grad_norm": 0.6062559485435486, "learning_rate": 0.0002, "epoch": 3.710217755443886, "step": 4430}, {"loss": 1.5101, "grad_norm": 0.5800350308418274, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 4440}, {"loss": 1.5286, "grad_norm": 0.5954474210739136, "learning_rate": 0.0002, "epoch": 3.726968174204355, "step": 4450}, {"loss": 1.6133, "grad_norm": 0.5880125761032104, "learning_rate": 0.0002, "epoch": 3.7353433835845897, "step": 4460}, {"loss": 1.5055, "grad_norm": 0.5880921483039856, "learning_rate": 0.0002, "epoch": 3.7437185929648242, "step": 4470}, {"loss": 1.5728, "grad_norm": 0.5995073914527893, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 4480}, {"loss": 1.554, "grad_norm": 0.5958493947982788, "learning_rate": 0.0002, "epoch": 3.7604690117252932, "step": 4490}, {"loss": 1.5472, "grad_norm": 0.5694711804389954, "learning_rate": 0.0002, "epoch": 3.7688442211055277, "step": 4500}, {"loss": 1.5105, "grad_norm": 0.6175141930580139, "learning_rate": 0.0002, "epoch": 3.7772194304857623, "step": 4510}, {"loss": 1.5404, "grad_norm": 0.5541581511497498, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 4520}, {"loss": 1.5283, "grad_norm": 0.5986164808273315, "learning_rate": 0.0002, "epoch": 3.7939698492462313, "step": 4530}, {"loss": 1.4961, "grad_norm": 0.640072226524353, "learning_rate": 0.0002, "epoch": 3.8023450586264658, "step": 4540}, {"loss": 1.5297, "grad_norm": 0.5742579698562622, "learning_rate": 0.0002, "epoch": 3.8107202680067003, "step": 4550}, {"loss": 1.5591, "grad_norm": 0.6658656001091003, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 4560}, {"loss": 1.4992, "grad_norm": 0.5475369691848755, "learning_rate": 0.0002, "epoch": 3.8274706867671693, "step": 4570}, {"loss": 1.5966, "grad_norm": 0.613172173500061, "learning_rate": 0.0002, "epoch": 3.835845896147404, "step": 4580}, {"loss": 1.5594, "grad_norm": 0.590968132019043, "learning_rate": 0.0002, "epoch": 3.8442211055276383, "step": 4590}, {"loss": 1.5067, "grad_norm": 0.5865461826324463, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 4600}, {"loss": 1.5247, "grad_norm": 0.6815178990364075, "learning_rate": 0.0002, "epoch": 3.8609715242881073, "step": 4610}, {"loss": 1.5702, "grad_norm": 0.6551400423049927, "learning_rate": 0.0002, "epoch": 3.869346733668342, "step": 4620}, {"loss": 1.4891, "grad_norm": 0.6398897171020508, "learning_rate": 0.0002, "epoch": 3.8777219430485763, "step": 4630}, {"loss": 1.5353, "grad_norm": 0.6761762499809265, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 4640}, {"loss": 1.6071, "grad_norm": 0.6277294754981995, "learning_rate": 0.0002, "epoch": 3.8944723618090453, "step": 4650}, {"loss": 1.5605, "grad_norm": 0.6285301446914673, "learning_rate": 0.0002, "epoch": 3.90284757118928, "step": 4660}, {"loss": 1.5937, "grad_norm": 0.5416069626808167, "learning_rate": 0.0002, "epoch": 3.9112227805695143, "step": 4670}, {"loss": 1.5461, "grad_norm": 0.6314545273780823, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 4680}, {"loss": 1.4828, "grad_norm": 0.604479968547821, "learning_rate": 0.0002, "epoch": 3.9279731993299833, "step": 4690}, {"loss": 1.5186, "grad_norm": 0.5321660041809082, "learning_rate": 0.0002, "epoch": 3.936348408710218, "step": 4700}, {"loss": 1.4696, "grad_norm": 0.6632516980171204, "learning_rate": 0.0002, "epoch": 3.9447236180904524, "step": 4710}, {"loss": 1.519, "grad_norm": 0.5925896763801575, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 4720}, {"loss": 1.5716, "grad_norm": 0.6580308675765991, "learning_rate": 0.0002, "epoch": 3.9614740368509214, "step": 4730}, {"loss": 1.4462, "grad_norm": 0.5578170418739319, "learning_rate": 0.0002, "epoch": 3.969849246231156, "step": 4740}, {"loss": 1.5394, "grad_norm": 0.6216608285903931, "learning_rate": 0.0002, "epoch": 3.9782244556113904, "step": 4750}, {"loss": 1.5395, "grad_norm": 0.5693069696426392, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 4760}, {"loss": 1.5517, "grad_norm": 0.5353434681892395, "learning_rate": 0.0002, "epoch": 3.9949748743718594, "step": 4770}, {"eval_loss": 1.8809821605682373, "eval_runtime": 37.9695, "eval_samples_per_second": 13.564, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4776}, {"loss": 1.4608, "grad_norm": 0.6117817759513855, "learning_rate": 0.0002, "epoch": 4.0033500837520934, "step": 4780}, {"loss": 1.2982, "grad_norm": 0.6816073656082153, "learning_rate": 0.0002, "epoch": 4.011725293132328, "step": 4790}, {"loss": 1.3464, "grad_norm": 0.715548038482666, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 4800}, {"loss": 1.3918, "grad_norm": 0.8585814833641052, "learning_rate": 0.0002, "epoch": 4.028475711892797, "step": 4810}, {"loss": 1.4137, "grad_norm": 0.7372158765792847, "learning_rate": 0.0002, "epoch": 4.0368509212730315, "step": 4820}, {"loss": 1.3769, "grad_norm": 0.8915117979049683, "learning_rate": 0.0002, "epoch": 4.045226130653266, "step": 4830}, {"loss": 1.3551, "grad_norm": 0.9323588013648987, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 4840}, {"loss": 1.3687, "grad_norm": 0.9298437237739563, "learning_rate": 0.0002, "epoch": 4.061976549413735, "step": 4850}, {"loss": 1.4173, "grad_norm": 0.8541792035102844, "learning_rate": 0.0002, "epoch": 4.0703517587939695, "step": 4860}, {"loss": 1.3668, "grad_norm": 0.7833571434020996, "learning_rate": 0.0002, "epoch": 4.078726968174204, "step": 4870}, {"loss": 1.3835, "grad_norm": 0.9325295090675354, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 4880}, {"loss": 1.3834, "grad_norm": 0.7066370248794556, "learning_rate": 0.0002, "epoch": 4.0954773869346734, "step": 4890}, {"loss": 1.3661, "grad_norm": 0.712640643119812, "learning_rate": 0.0002, "epoch": 4.1038525963149075, "step": 4900}, {"loss": 1.3637, "grad_norm": 0.6970218420028687, "learning_rate": 0.0002, "epoch": 4.1122278056951425, "step": 4910}, {"loss": 1.3805, "grad_norm": 0.7979312539100647, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 4920}, {"loss": 1.4115, "grad_norm": 0.7801558375358582, "learning_rate": 0.0002, "epoch": 4.1289782244556115, "step": 4930}, {"loss": 1.3288, "grad_norm": 0.7505159974098206, "learning_rate": 0.0002, "epoch": 4.1373534338358455, "step": 4940}, {"loss": 1.3453, "grad_norm": 0.738201916217804, "learning_rate": 0.0002, "epoch": 4.1457286432160805, "step": 4950}, {"loss": 1.3418, "grad_norm": 0.7736659049987793, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 4960}, {"loss": 1.3663, "grad_norm": 0.7850064635276794, "learning_rate": 0.0002, "epoch": 4.1624790619765495, "step": 4970}, {"loss": 1.326, "grad_norm": 0.8316620588302612, "learning_rate": 0.0002, "epoch": 4.1708542713567835, "step": 4980}, {"loss": 1.377, "grad_norm": 0.7217330932617188, "learning_rate": 0.0002, "epoch": 4.1792294807370185, "step": 4990}, {"loss": 1.3299, "grad_norm": 0.7050199508666992, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 5000}, {"loss": 1.3798, "grad_norm": 0.6992659568786621, "learning_rate": 0.0002, "epoch": 4.1959798994974875, "step": 5010}, {"loss": 1.3391, "grad_norm": 0.7648445963859558, "learning_rate": 0.0002, "epoch": 4.204355108877722, "step": 5020}, {"loss": 1.3339, "grad_norm": 0.8093137741088867, "learning_rate": 0.0002, "epoch": 4.2127303182579565, "step": 5030}, {"loss": 1.37, "grad_norm": 0.6907750368118286, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 5040}, {"loss": 1.4231, "grad_norm": 0.7000078558921814, "learning_rate": 0.0002, "epoch": 4.2294807370184255, "step": 5050}, {"loss": 1.3411, "grad_norm": 0.715034008026123, "learning_rate": 0.0002, "epoch": 4.23785594639866, "step": 5060}, {"loss": 1.3795, "grad_norm": 0.828895628452301, "learning_rate": 0.0002, "epoch": 4.2462311557788945, "step": 5070}, {"loss": 1.3397, "grad_norm": 0.7127292156219482, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 5080}, {"loss": 1.4255, "grad_norm": 0.8256623148918152, "learning_rate": 0.0002, "epoch": 4.2629815745393635, "step": 5090}, {"loss": 1.4078, "grad_norm": 0.8062452077865601, "learning_rate": 0.0002, "epoch": 4.271356783919598, "step": 5100}, {"loss": 1.3705, "grad_norm": 0.6861081123352051, "learning_rate": 0.0002, "epoch": 4.279731993299833, "step": 5110}, {"loss": 1.3463, "grad_norm": 0.7566041350364685, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 5120}, {"loss": 1.4571, "grad_norm": 0.8734753727912903, "learning_rate": 0.0002, "epoch": 4.296482412060302, "step": 5130}, {"loss": 1.4747, "grad_norm": 0.8559320569038391, "learning_rate": 0.0002, "epoch": 4.304857621440536, "step": 5140}, {"loss": 1.3551, "grad_norm": 0.6965576410293579, "learning_rate": 0.0002, "epoch": 4.313232830820771, "step": 5150}, {"loss": 1.3485, "grad_norm": 0.8277813792228699, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 5160}, {"loss": 1.3433, "grad_norm": 1.0733633041381836, "learning_rate": 0.0002, "epoch": 4.32998324958124, "step": 5170}, {"loss": 1.3953, "grad_norm": 0.7914809584617615, "learning_rate": 0.0002, "epoch": 4.338358458961474, "step": 5180}, {"loss": 1.3907, "grad_norm": 0.8307849168777466, "learning_rate": 0.0002, "epoch": 4.346733668341709, "step": 5190}, {"loss": 1.4318, "grad_norm": 0.7066516280174255, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 5200}, {"loss": 1.3866, "grad_norm": 0.9676792025566101, "learning_rate": 0.0002, "epoch": 4.363484087102178, "step": 5210}, {"loss": 1.3973, "grad_norm": 0.7672301530838013, "learning_rate": 0.0002, "epoch": 4.371859296482412, "step": 5220}, {"loss": 1.3576, "grad_norm": 0.6888260245323181, "learning_rate": 0.0002, "epoch": 4.380234505862647, "step": 5230}, {"loss": 1.3815, "grad_norm": 0.8775295615196228, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 5240}, {"loss": 1.3224, "grad_norm": 0.8742642998695374, "learning_rate": 0.0002, "epoch": 4.396984924623116, "step": 5250}, {"loss": 1.4609, "grad_norm": 0.6935433745384216, "learning_rate": 0.0002, "epoch": 4.40536013400335, "step": 5260}, {"loss": 1.3605, "grad_norm": 0.7726178169250488, "learning_rate": 0.0002, "epoch": 4.413735343383585, "step": 5270}, {"loss": 1.4591, "grad_norm": 0.7493860721588135, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 5280}, {"loss": 1.3277, "grad_norm": 0.7758517265319824, "learning_rate": 0.0002, "epoch": 4.430485762144054, "step": 5290}, {"loss": 1.2916, "grad_norm": 0.779315173625946, "learning_rate": 0.0002, "epoch": 4.438860971524288, "step": 5300}, {"loss": 1.4483, "grad_norm": 0.7753667235374451, "learning_rate": 0.0002, "epoch": 4.447236180904523, "step": 5310}, {"loss": 1.2513, "grad_norm": 0.8738188743591309, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 5320}, {"loss": 1.41, "grad_norm": 0.8410757184028625, "learning_rate": 0.0002, "epoch": 4.463986599664992, "step": 5330}, {"loss": 1.3809, "grad_norm": 0.728897750377655, "learning_rate": 0.0002, "epoch": 4.472361809045226, "step": 5340}, {"loss": 1.4049, "grad_norm": 0.7880531549453735, "learning_rate": 0.0002, "epoch": 4.480737018425461, "step": 5350}, {"loss": 1.4106, "grad_norm": 0.8455142378807068, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 5360}, {"loss": 1.431, "grad_norm": 0.8527868986129761, "learning_rate": 0.0002, "epoch": 4.49748743718593, "step": 5370}, {"loss": 1.3586, "grad_norm": 0.7743009328842163, "learning_rate": 0.0002, "epoch": 4.505862646566165, "step": 5380}, {"loss": 1.4175, "grad_norm": 0.7555320858955383, "learning_rate": 0.0002, "epoch": 4.514237855946399, "step": 5390}, {"loss": 1.3433, "grad_norm": 0.8146619200706482, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 5400}, {"loss": 1.4859, "grad_norm": 0.8042502999305725, "learning_rate": 0.0002, "epoch": 4.530988274706868, "step": 5410}, {"loss": 1.3843, "grad_norm": 0.7329140305519104, "learning_rate": 0.0002, "epoch": 4.539363484087103, "step": 5420}, {"loss": 1.3946, "grad_norm": 0.7574753165245056, "learning_rate": 0.0002, "epoch": 4.547738693467337, "step": 5430}, {"loss": 1.3048, "grad_norm": 1.1223409175872803, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 5440}, {"loss": 1.4067, "grad_norm": 0.7647369503974915, "learning_rate": 0.0002, "epoch": 4.564489112227806, "step": 5450}, {"loss": 1.4569, "grad_norm": 0.9135531187057495, "learning_rate": 0.0002, "epoch": 4.572864321608041, "step": 5460}, {"loss": 1.4813, "grad_norm": 0.9343693852424622, "learning_rate": 0.0002, "epoch": 4.581239530988275, "step": 5470}, {"loss": 1.385, "grad_norm": 0.869945764541626, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 5480}, {"loss": 1.4067, "grad_norm": 0.7383785843849182, "learning_rate": 0.0002, "epoch": 4.597989949748744, "step": 5490}, {"loss": 1.3698, "grad_norm": 0.7988699674606323, "learning_rate": 0.0002, "epoch": 4.606365159128979, "step": 5500}, {"loss": 1.3834, "grad_norm": 0.8731256127357483, "learning_rate": 0.0002, "epoch": 4.614740368509213, "step": 5510}, {"loss": 1.4393, "grad_norm": 0.7577664256095886, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 5520}, {"loss": 1.4418, "grad_norm": 0.7825039625167847, "learning_rate": 0.0002, "epoch": 4.631490787269682, "step": 5530}, {"loss": 1.4594, "grad_norm": 0.8534902930259705, "learning_rate": 0.0002, "epoch": 4.639865996649917, "step": 5540}, {"loss": 1.3689, "grad_norm": 0.7403318285942078, "learning_rate": 0.0002, "epoch": 4.648241206030151, "step": 5550}, {"loss": 1.4456, "grad_norm": 0.8229990005493164, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 5560}, {"loss": 1.3854, "grad_norm": 0.8279513716697693, "learning_rate": 0.0002, "epoch": 4.66499162479062, "step": 5570}, {"loss": 1.4472, "grad_norm": 0.8923851251602173, "learning_rate": 0.0002, "epoch": 4.673366834170855, "step": 5580}, {"loss": 1.3999, "grad_norm": 0.7457540035247803, "learning_rate": 0.0002, "epoch": 4.681742043551089, "step": 5590}, {"loss": 1.4341, "grad_norm": 0.7110715508460999, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 5600}, {"loss": 1.4327, "grad_norm": 0.7135499119758606, "learning_rate": 0.0002, "epoch": 4.698492462311558, "step": 5610}, {"loss": 1.4321, "grad_norm": 0.7606837153434753, "learning_rate": 0.0002, "epoch": 4.706867671691793, "step": 5620}, {"loss": 1.3792, "grad_norm": 0.9622916579246521, "learning_rate": 0.0002, "epoch": 4.715242881072027, "step": 5630}, {"loss": 1.4, "grad_norm": 0.7665684819221497, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 5640}, {"loss": 1.3837, "grad_norm": 0.7985475659370422, "learning_rate": 0.0002, "epoch": 4.731993299832496, "step": 5650}, {"loss": 1.397, "grad_norm": 0.9179279208183289, "learning_rate": 0.0002, "epoch": 4.740368509212731, "step": 5660}, {"loss": 1.4379, "grad_norm": 0.8311634063720703, "learning_rate": 0.0002, "epoch": 4.748743718592965, "step": 5670}, {"loss": 1.3546, "grad_norm": 0.7773269414901733, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 5680}, {"loss": 1.4031, "grad_norm": 0.7771748900413513, "learning_rate": 0.0002, "epoch": 4.765494137353434, "step": 5690}, {"loss": 1.3724, "grad_norm": 0.7518507242202759, "learning_rate": 0.0002, "epoch": 4.773869346733669, "step": 5700}, {"loss": 1.3247, "grad_norm": 0.7699326276779175, "learning_rate": 0.0002, "epoch": 4.782244556113903, "step": 5710}, {"loss": 1.437, "grad_norm": 0.7001115679740906, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 5720}, {"loss": 1.4257, "grad_norm": 0.7220682501792908, "learning_rate": 0.0002, "epoch": 4.798994974874372, "step": 5730}, {"loss": 1.4174, "grad_norm": 0.7654005289077759, "learning_rate": 0.0002, "epoch": 4.807370184254607, "step": 5740}, {"loss": 1.3792, "grad_norm": 0.8132795095443726, "learning_rate": 0.0002, "epoch": 4.815745393634841, "step": 5750}, {"loss": 1.4007, "grad_norm": 0.7105404138565063, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 5760}, {"loss": 1.4289, "grad_norm": 0.9346209764480591, "learning_rate": 0.0002, "epoch": 4.83249581239531, "step": 5770}, {"loss": 1.4066, "grad_norm": 1.0075623989105225, "learning_rate": 0.0002, "epoch": 4.840871021775545, "step": 5780}, {"loss": 1.4558, "grad_norm": 0.758376955986023, "learning_rate": 0.0002, "epoch": 4.849246231155779, "step": 5790}, {"loss": 1.4117, "grad_norm": 0.854821503162384, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 5800}, {"loss": 1.4014, "grad_norm": 0.8226943016052246, "learning_rate": 0.0002, "epoch": 4.865996649916248, "step": 5810}, {"loss": 1.3963, "grad_norm": 0.7510473728179932, "learning_rate": 0.0002, "epoch": 4.874371859296483, "step": 5820}, {"loss": 1.4463, "grad_norm": 0.7449678182601929, "learning_rate": 0.0002, "epoch": 4.882747068676717, "step": 5830}, {"loss": 1.3691, "grad_norm": 0.7840824723243713, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 5840}, {"loss": 1.3795, "grad_norm": 0.8811169862747192, "learning_rate": 0.0002, "epoch": 4.899497487437186, "step": 5850}, {"loss": 1.3827, "grad_norm": 0.84914630651474, "learning_rate": 0.0002, "epoch": 4.907872696817421, "step": 5860}, {"loss": 1.4549, "grad_norm": 0.7514461874961853, "learning_rate": 0.0002, "epoch": 4.916247906197655, "step": 5870}, {"loss": 1.3633, "grad_norm": 0.7229002118110657, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 5880}, {"loss": 1.4302, "grad_norm": 0.9418245553970337, "learning_rate": 0.0002, "epoch": 4.932998324958124, "step": 5890}, {"loss": 1.4747, "grad_norm": 0.7626827359199524, "learning_rate": 0.0002, "epoch": 4.941373534338359, "step": 5900}, {"loss": 1.4462, "grad_norm": 0.7711105346679688, "learning_rate": 0.0002, "epoch": 4.949748743718593, "step": 5910}, {"loss": 1.4104, "grad_norm": 0.8689648509025574, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 5920}, {"loss": 1.4273, "grad_norm": 0.7873271107673645, "learning_rate": 0.0002, "epoch": 4.966499162479062, "step": 5930}, {"loss": 1.4361, "grad_norm": 0.7637495994567871, "learning_rate": 0.0002, "epoch": 4.974874371859297, "step": 5940}, {"loss": 1.5037, "grad_norm": 0.9907955527305603, "learning_rate": 0.0002, "epoch": 4.983249581239531, "step": 5950}, {"loss": 1.4476, "grad_norm": 0.7827328443527222, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 5960}, {"loss": 1.4252, "grad_norm": 0.818544328212738, "learning_rate": 0.0002, "epoch": 5.0, "step": 5970}]} +{"epoch": 6.0, "step": 7164, "epoch_duration": 1298.7831723690033, "total_accumulated_duration": 7929.984039068222, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}, {"eval_loss": 1.8283122777938843, "eval_runtime": 38.023, "eval_samples_per_second": 13.544, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 3582}, {"loss": 1.5874, "grad_norm": 0.6784713268280029, "learning_rate": 0.0002, "epoch": 3.006700167504188, "step": 3590}, {"loss": 1.5813, "grad_norm": 0.5783940553665161, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 3600}, {"loss": 1.4769, "grad_norm": 0.5408937335014343, "learning_rate": 0.0002, "epoch": 3.023450586264657, "step": 3610}, {"loss": 1.526, "grad_norm": 0.5229013562202454, "learning_rate": 0.0002, "epoch": 3.0318257956448913, "step": 3620}, {"loss": 1.4835, "grad_norm": 0.49160143733024597, "learning_rate": 0.0002, "epoch": 3.040201005025126, "step": 3630}, {"loss": 1.5398, "grad_norm": 0.6563201546669006, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 3640}, {"loss": 1.448, "grad_norm": 0.5686020851135254, "learning_rate": 0.0002, "epoch": 3.056951423785595, "step": 3650}, {"loss": 1.4541, "grad_norm": 0.5774043202400208, "learning_rate": 0.0002, "epoch": 3.0653266331658293, "step": 3660}, {"loss": 1.4734, "grad_norm": 0.6106171011924744, "learning_rate": 0.0002, "epoch": 3.073701842546064, "step": 3670}, {"loss": 1.4961, "grad_norm": 0.517433226108551, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 3680}, {"loss": 1.4961, "grad_norm": 0.5681702494621277, "learning_rate": 0.0002, "epoch": 3.090452261306533, "step": 3690}, {"loss": 1.4731, "grad_norm": 0.5769233107566833, "learning_rate": 0.0002, "epoch": 3.0988274706867673, "step": 3700}, {"loss": 1.4836, "grad_norm": 0.5657462477684021, "learning_rate": 0.0002, "epoch": 3.107202680067002, "step": 3710}, {"loss": 1.4526, "grad_norm": 0.6035246253013611, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 3720}, {"loss": 1.5102, "grad_norm": 0.7286643385887146, "learning_rate": 0.0002, "epoch": 3.123953098827471, "step": 3730}, {"loss": 1.4444, "grad_norm": 0.5121201872825623, "learning_rate": 0.0002, "epoch": 3.1323283082077054, "step": 3740}, {"loss": 1.565, "grad_norm": 0.5074213147163391, "learning_rate": 0.0002, "epoch": 3.14070351758794, "step": 3750}, {"loss": 1.4729, "grad_norm": 0.57481849193573, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 3760}, {"loss": 1.4765, "grad_norm": 0.6326663494110107, "learning_rate": 0.0002, "epoch": 3.157453936348409, "step": 3770}, {"loss": 1.4888, "grad_norm": 0.6039315462112427, "learning_rate": 0.0002, "epoch": 3.1658291457286434, "step": 3780}, {"loss": 1.5084, "grad_norm": 0.6936715245246887, "learning_rate": 0.0002, "epoch": 3.174204355108878, "step": 3790}, {"loss": 1.4879, "grad_norm": 0.6516796946525574, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 3800}, {"loss": 1.578, "grad_norm": 0.6140730977058411, "learning_rate": 0.0002, "epoch": 3.190954773869347, "step": 3810}, {"loss": 1.5101, "grad_norm": 0.631328284740448, "learning_rate": 0.0002, "epoch": 3.1993299832495814, "step": 3820}, {"loss": 1.4844, "grad_norm": 0.6265402436256409, "learning_rate": 0.0002, "epoch": 3.207705192629816, "step": 3830}, {"loss": 1.5332, "grad_norm": 0.6649428606033325, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 3840}, {"loss": 1.5231, "grad_norm": 0.5329259634017944, "learning_rate": 0.0002, "epoch": 3.224455611390285, "step": 3850}, {"loss": 1.5714, "grad_norm": 0.6008304953575134, "learning_rate": 0.0002, "epoch": 3.2328308207705194, "step": 3860}, {"loss": 1.5214, "grad_norm": 0.5918582081794739, "learning_rate": 0.0002, "epoch": 3.241206030150754, "step": 3870}, {"loss": 1.571, "grad_norm": 0.643622100353241, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 3880}, {"loss": 1.5274, "grad_norm": 0.5517964363098145, "learning_rate": 0.0002, "epoch": 3.257956448911223, "step": 3890}, {"loss": 1.5458, "grad_norm": 0.6780755519866943, "learning_rate": 0.0002, "epoch": 3.2663316582914574, "step": 3900}, {"loss": 1.5743, "grad_norm": 0.6742202639579773, "learning_rate": 0.0002, "epoch": 3.274706867671692, "step": 3910}, {"loss": 1.5279, "grad_norm": 0.6228749752044678, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 3920}, {"loss": 1.4899, "grad_norm": 0.5836303234100342, "learning_rate": 0.0002, "epoch": 3.291457286432161, "step": 3930}, {"loss": 1.5445, "grad_norm": 0.6337724328041077, "learning_rate": 0.0002, "epoch": 3.2998324958123955, "step": 3940}, {"loss": 1.5618, "grad_norm": 0.6345084309577942, "learning_rate": 0.0002, "epoch": 3.30820770519263, "step": 3950}, {"loss": 1.4224, "grad_norm": 0.6125303506851196, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 3960}, {"loss": 1.5355, "grad_norm": 0.6259911060333252, "learning_rate": 0.0002, "epoch": 3.324958123953099, "step": 3970}, {"loss": 1.5427, "grad_norm": 0.645745575428009, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 3980}, {"loss": 1.5817, "grad_norm": 0.6666176915168762, "learning_rate": 0.0002, "epoch": 3.341708542713568, "step": 3990}, {"loss": 1.4998, "grad_norm": 0.59013831615448, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 4000}, {"loss": 1.4921, "grad_norm": 0.6604634523391724, "learning_rate": 0.0002, "epoch": 3.358458961474037, "step": 4010}, {"loss": 1.5076, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 3.3668341708542715, "step": 4020}, {"loss": 1.4801, "grad_norm": 0.515724778175354, "learning_rate": 0.0002, "epoch": 3.375209380234506, "step": 4030}, {"loss": 1.4932, "grad_norm": 0.681968092918396, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 4040}, {"loss": 1.5148, "grad_norm": 0.5978158116340637, "learning_rate": 0.0002, "epoch": 3.391959798994975, "step": 4050}, {"loss": 1.5449, "grad_norm": 0.6043432354927063, "learning_rate": 0.0002, "epoch": 3.4003350083752095, "step": 4060}, {"loss": 1.5021, "grad_norm": 0.5899770855903625, "learning_rate": 0.0002, "epoch": 3.408710217755444, "step": 4070}, {"loss": 1.5992, "grad_norm": 0.6014242172241211, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 4080}, {"loss": 1.4692, "grad_norm": 0.5944811105728149, "learning_rate": 0.0002, "epoch": 3.425460636515913, "step": 4090}, {"loss": 1.5877, "grad_norm": 0.6506822109222412, "learning_rate": 0.0002, "epoch": 3.4338358458961475, "step": 4100}, {"loss": 1.5144, "grad_norm": 0.6926528811454773, "learning_rate": 0.0002, "epoch": 3.442211055276382, "step": 4110}, {"loss": 1.5169, "grad_norm": 0.5646378993988037, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 4120}, {"loss": 1.5032, "grad_norm": 0.7233654856681824, "learning_rate": 0.0002, "epoch": 3.458961474036851, "step": 4130}, {"loss": 1.5161, "grad_norm": 0.6231815814971924, "learning_rate": 0.0002, "epoch": 3.4673366834170856, "step": 4140}, {"loss": 1.5349, "grad_norm": 0.6115689873695374, "learning_rate": 0.0002, "epoch": 3.47571189279732, "step": 4150}, {"loss": 1.4621, "grad_norm": 0.5812674760818481, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 4160}, {"loss": 1.5465, "grad_norm": 0.6099632978439331, "learning_rate": 0.0002, "epoch": 3.492462311557789, "step": 4170}, {"loss": 1.4795, "grad_norm": 0.6102647185325623, "learning_rate": 0.0002, "epoch": 3.5008375209380236, "step": 4180}, {"loss": 1.5305, "grad_norm": 0.6034680008888245, "learning_rate": 0.0002, "epoch": 3.509212730318258, "step": 4190}, {"loss": 1.5093, "grad_norm": 0.6281666159629822, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 4200}, {"loss": 1.4903, "grad_norm": 0.6245372295379639, "learning_rate": 0.0002, "epoch": 3.525963149078727, "step": 4210}, {"loss": 1.5098, "grad_norm": 0.5897293090820312, "learning_rate": 0.0002, "epoch": 3.5343383584589616, "step": 4220}, {"loss": 1.5991, "grad_norm": 0.601054847240448, "learning_rate": 0.0002, "epoch": 3.542713567839196, "step": 4230}, {"loss": 1.4974, "grad_norm": 0.7004473805427551, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 4240}, {"loss": 1.5993, "grad_norm": 0.6601553559303284, "learning_rate": 0.0002, "epoch": 3.559463986599665, "step": 4250}, {"loss": 1.4961, "grad_norm": 0.6112467050552368, "learning_rate": 0.0002, "epoch": 3.5678391959798996, "step": 4260}, {"loss": 1.4967, "grad_norm": 0.5902454853057861, "learning_rate": 0.0002, "epoch": 3.576214405360134, "step": 4270}, {"loss": 1.5659, "grad_norm": 0.5792450904846191, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 4280}, {"loss": 1.4664, "grad_norm": 0.5923888087272644, "learning_rate": 0.0002, "epoch": 3.592964824120603, "step": 4290}, {"loss": 1.5155, "grad_norm": 0.5869482159614563, "learning_rate": 0.0002, "epoch": 3.6013400335008376, "step": 4300}, {"loss": 1.5119, "grad_norm": 0.6372929811477661, "learning_rate": 0.0002, "epoch": 3.609715242881072, "step": 4310}, {"loss": 1.4977, "grad_norm": 0.6350686550140381, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 4320}, {"loss": 1.5226, "grad_norm": 0.571819007396698, "learning_rate": 0.0002, "epoch": 3.626465661641541, "step": 4330}, {"loss": 1.5414, "grad_norm": 0.592250645160675, "learning_rate": 0.0002, "epoch": 3.6348408710217757, "step": 4340}, {"loss": 1.4912, "grad_norm": 0.6110650897026062, "learning_rate": 0.0002, "epoch": 3.64321608040201, "step": 4350}, {"loss": 1.6089, "grad_norm": 0.6187081336975098, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 4360}, {"loss": 1.5345, "grad_norm": 0.6197671890258789, "learning_rate": 0.0002, "epoch": 3.659966499162479, "step": 4370}, {"loss": 1.4988, "grad_norm": 0.6050862669944763, "learning_rate": 0.0002, "epoch": 3.6683417085427137, "step": 4380}, {"loss": 1.4872, "grad_norm": 0.621265172958374, "learning_rate": 0.0002, "epoch": 3.676716917922948, "step": 4390}, {"loss": 1.6011, "grad_norm": 0.6552940011024475, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 4400}, {"loss": 1.4344, "grad_norm": 0.5638861060142517, "learning_rate": 0.0002, "epoch": 3.693467336683417, "step": 4410}, {"loss": 1.4985, "grad_norm": 0.6388863325119019, "learning_rate": 0.0002, "epoch": 3.7018425460636517, "step": 4420}, {"loss": 1.3696, "grad_norm": 0.6062559485435486, "learning_rate": 0.0002, "epoch": 3.710217755443886, "step": 4430}, {"loss": 1.5101, "grad_norm": 0.5800350308418274, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 4440}, {"loss": 1.5286, "grad_norm": 0.5954474210739136, "learning_rate": 0.0002, "epoch": 3.726968174204355, "step": 4450}, {"loss": 1.6133, "grad_norm": 0.5880125761032104, "learning_rate": 0.0002, "epoch": 3.7353433835845897, "step": 4460}, {"loss": 1.5055, "grad_norm": 0.5880921483039856, "learning_rate": 0.0002, "epoch": 3.7437185929648242, "step": 4470}, {"loss": 1.5728, "grad_norm": 0.5995073914527893, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 4480}, {"loss": 1.554, "grad_norm": 0.5958493947982788, "learning_rate": 0.0002, "epoch": 3.7604690117252932, "step": 4490}, {"loss": 1.5472, "grad_norm": 0.5694711804389954, "learning_rate": 0.0002, "epoch": 3.7688442211055277, "step": 4500}, {"loss": 1.5105, "grad_norm": 0.6175141930580139, "learning_rate": 0.0002, "epoch": 3.7772194304857623, "step": 4510}, {"loss": 1.5404, "grad_norm": 0.5541581511497498, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 4520}, {"loss": 1.5283, "grad_norm": 0.5986164808273315, "learning_rate": 0.0002, "epoch": 3.7939698492462313, "step": 4530}, {"loss": 1.4961, "grad_norm": 0.640072226524353, "learning_rate": 0.0002, "epoch": 3.8023450586264658, "step": 4540}, {"loss": 1.5297, "grad_norm": 0.5742579698562622, "learning_rate": 0.0002, "epoch": 3.8107202680067003, "step": 4550}, {"loss": 1.5591, "grad_norm": 0.6658656001091003, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 4560}, {"loss": 1.4992, "grad_norm": 0.5475369691848755, "learning_rate": 0.0002, "epoch": 3.8274706867671693, "step": 4570}, {"loss": 1.5966, "grad_norm": 0.613172173500061, "learning_rate": 0.0002, "epoch": 3.835845896147404, "step": 4580}, {"loss": 1.5594, "grad_norm": 0.590968132019043, "learning_rate": 0.0002, "epoch": 3.8442211055276383, "step": 4590}, {"loss": 1.5067, "grad_norm": 0.5865461826324463, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 4600}, {"loss": 1.5247, "grad_norm": 0.6815178990364075, "learning_rate": 0.0002, "epoch": 3.8609715242881073, "step": 4610}, {"loss": 1.5702, "grad_norm": 0.6551400423049927, "learning_rate": 0.0002, "epoch": 3.869346733668342, "step": 4620}, {"loss": 1.4891, "grad_norm": 0.6398897171020508, "learning_rate": 0.0002, "epoch": 3.8777219430485763, "step": 4630}, {"loss": 1.5353, "grad_norm": 0.6761762499809265, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 4640}, {"loss": 1.6071, "grad_norm": 0.6277294754981995, "learning_rate": 0.0002, "epoch": 3.8944723618090453, "step": 4650}, {"loss": 1.5605, "grad_norm": 0.6285301446914673, "learning_rate": 0.0002, "epoch": 3.90284757118928, "step": 4660}, {"loss": 1.5937, "grad_norm": 0.5416069626808167, "learning_rate": 0.0002, "epoch": 3.9112227805695143, "step": 4670}, {"loss": 1.5461, "grad_norm": 0.6314545273780823, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 4680}, {"loss": 1.4828, "grad_norm": 0.604479968547821, "learning_rate": 0.0002, "epoch": 3.9279731993299833, "step": 4690}, {"loss": 1.5186, "grad_norm": 0.5321660041809082, "learning_rate": 0.0002, "epoch": 3.936348408710218, "step": 4700}, {"loss": 1.4696, "grad_norm": 0.6632516980171204, "learning_rate": 0.0002, "epoch": 3.9447236180904524, "step": 4710}, {"loss": 1.519, "grad_norm": 0.5925896763801575, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 4720}, {"loss": 1.5716, "grad_norm": 0.6580308675765991, "learning_rate": 0.0002, "epoch": 3.9614740368509214, "step": 4730}, {"loss": 1.4462, "grad_norm": 0.5578170418739319, "learning_rate": 0.0002, "epoch": 3.969849246231156, "step": 4740}, {"loss": 1.5394, "grad_norm": 0.6216608285903931, "learning_rate": 0.0002, "epoch": 3.9782244556113904, "step": 4750}, {"loss": 1.5395, "grad_norm": 0.5693069696426392, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 4760}, {"loss": 1.5517, "grad_norm": 0.5353434681892395, "learning_rate": 0.0002, "epoch": 3.9949748743718594, "step": 4770}, {"eval_loss": 1.8809821605682373, "eval_runtime": 37.9695, "eval_samples_per_second": 13.564, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4776}, {"loss": 1.4608, "grad_norm": 0.6117817759513855, "learning_rate": 0.0002, "epoch": 4.0033500837520934, "step": 4780}, {"loss": 1.2982, "grad_norm": 0.6816073656082153, "learning_rate": 0.0002, "epoch": 4.011725293132328, "step": 4790}, {"loss": 1.3464, "grad_norm": 0.715548038482666, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 4800}, {"loss": 1.3918, "grad_norm": 0.8585814833641052, "learning_rate": 0.0002, "epoch": 4.028475711892797, "step": 4810}, {"loss": 1.4137, "grad_norm": 0.7372158765792847, "learning_rate": 0.0002, "epoch": 4.0368509212730315, "step": 4820}, {"loss": 1.3769, "grad_norm": 0.8915117979049683, "learning_rate": 0.0002, "epoch": 4.045226130653266, "step": 4830}, {"loss": 1.3551, "grad_norm": 0.9323588013648987, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 4840}, {"loss": 1.3687, "grad_norm": 0.9298437237739563, "learning_rate": 0.0002, "epoch": 4.061976549413735, "step": 4850}, {"loss": 1.4173, "grad_norm": 0.8541792035102844, "learning_rate": 0.0002, "epoch": 4.0703517587939695, "step": 4860}, {"loss": 1.3668, "grad_norm": 0.7833571434020996, "learning_rate": 0.0002, "epoch": 4.078726968174204, "step": 4870}, {"loss": 1.3835, "grad_norm": 0.9325295090675354, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 4880}, {"loss": 1.3834, "grad_norm": 0.7066370248794556, "learning_rate": 0.0002, "epoch": 4.0954773869346734, "step": 4890}, {"loss": 1.3661, "grad_norm": 0.712640643119812, "learning_rate": 0.0002, "epoch": 4.1038525963149075, "step": 4900}, {"loss": 1.3637, "grad_norm": 0.6970218420028687, "learning_rate": 0.0002, "epoch": 4.1122278056951425, "step": 4910}, {"loss": 1.3805, "grad_norm": 0.7979312539100647, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 4920}, {"loss": 1.4115, "grad_norm": 0.7801558375358582, "learning_rate": 0.0002, "epoch": 4.1289782244556115, "step": 4930}, {"loss": 1.3288, "grad_norm": 0.7505159974098206, "learning_rate": 0.0002, "epoch": 4.1373534338358455, "step": 4940}, {"loss": 1.3453, "grad_norm": 0.738201916217804, "learning_rate": 0.0002, "epoch": 4.1457286432160805, "step": 4950}, {"loss": 1.3418, "grad_norm": 0.7736659049987793, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 4960}, {"loss": 1.3663, "grad_norm": 0.7850064635276794, "learning_rate": 0.0002, "epoch": 4.1624790619765495, "step": 4970}, {"loss": 1.326, "grad_norm": 0.8316620588302612, "learning_rate": 0.0002, "epoch": 4.1708542713567835, "step": 4980}, {"loss": 1.377, "grad_norm": 0.7217330932617188, "learning_rate": 0.0002, "epoch": 4.1792294807370185, "step": 4990}, {"loss": 1.3299, "grad_norm": 0.7050199508666992, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 5000}, {"loss": 1.3798, "grad_norm": 0.6992659568786621, "learning_rate": 0.0002, "epoch": 4.1959798994974875, "step": 5010}, {"loss": 1.3391, "grad_norm": 0.7648445963859558, "learning_rate": 0.0002, "epoch": 4.204355108877722, "step": 5020}, {"loss": 1.3339, "grad_norm": 0.8093137741088867, "learning_rate": 0.0002, "epoch": 4.2127303182579565, "step": 5030}, {"loss": 1.37, "grad_norm": 0.6907750368118286, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 5040}, {"loss": 1.4231, "grad_norm": 0.7000078558921814, "learning_rate": 0.0002, "epoch": 4.2294807370184255, "step": 5050}, {"loss": 1.3411, "grad_norm": 0.715034008026123, "learning_rate": 0.0002, "epoch": 4.23785594639866, "step": 5060}, {"loss": 1.3795, "grad_norm": 0.828895628452301, "learning_rate": 0.0002, "epoch": 4.2462311557788945, "step": 5070}, {"loss": 1.3397, "grad_norm": 0.7127292156219482, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 5080}, {"loss": 1.4255, "grad_norm": 0.8256623148918152, "learning_rate": 0.0002, "epoch": 4.2629815745393635, "step": 5090}, {"loss": 1.4078, "grad_norm": 0.8062452077865601, "learning_rate": 0.0002, "epoch": 4.271356783919598, "step": 5100}, {"loss": 1.3705, "grad_norm": 0.6861081123352051, "learning_rate": 0.0002, "epoch": 4.279731993299833, "step": 5110}, {"loss": 1.3463, "grad_norm": 0.7566041350364685, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 5120}, {"loss": 1.4571, "grad_norm": 0.8734753727912903, "learning_rate": 0.0002, "epoch": 4.296482412060302, "step": 5130}, {"loss": 1.4747, "grad_norm": 0.8559320569038391, "learning_rate": 0.0002, "epoch": 4.304857621440536, "step": 5140}, {"loss": 1.3551, "grad_norm": 0.6965576410293579, "learning_rate": 0.0002, "epoch": 4.313232830820771, "step": 5150}, {"loss": 1.3485, "grad_norm": 0.8277813792228699, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 5160}, {"loss": 1.3433, "grad_norm": 1.0733633041381836, "learning_rate": 0.0002, "epoch": 4.32998324958124, "step": 5170}, {"loss": 1.3953, "grad_norm": 0.7914809584617615, "learning_rate": 0.0002, "epoch": 4.338358458961474, "step": 5180}, {"loss": 1.3907, "grad_norm": 0.8307849168777466, "learning_rate": 0.0002, "epoch": 4.346733668341709, "step": 5190}, {"loss": 1.4318, "grad_norm": 0.7066516280174255, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 5200}, {"loss": 1.3866, "grad_norm": 0.9676792025566101, "learning_rate": 0.0002, "epoch": 4.363484087102178, "step": 5210}, {"loss": 1.3973, "grad_norm": 0.7672301530838013, "learning_rate": 0.0002, "epoch": 4.371859296482412, "step": 5220}, {"loss": 1.3576, "grad_norm": 0.6888260245323181, "learning_rate": 0.0002, "epoch": 4.380234505862647, "step": 5230}, {"loss": 1.3815, "grad_norm": 0.8775295615196228, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 5240}, {"loss": 1.3224, "grad_norm": 0.8742642998695374, "learning_rate": 0.0002, "epoch": 4.396984924623116, "step": 5250}, {"loss": 1.4609, "grad_norm": 0.6935433745384216, "learning_rate": 0.0002, "epoch": 4.40536013400335, "step": 5260}, {"loss": 1.3605, "grad_norm": 0.7726178169250488, "learning_rate": 0.0002, "epoch": 4.413735343383585, "step": 5270}, {"loss": 1.4591, "grad_norm": 0.7493860721588135, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 5280}, {"loss": 1.3277, "grad_norm": 0.7758517265319824, "learning_rate": 0.0002, "epoch": 4.430485762144054, "step": 5290}, {"loss": 1.2916, "grad_norm": 0.779315173625946, "learning_rate": 0.0002, "epoch": 4.438860971524288, "step": 5300}, {"loss": 1.4483, "grad_norm": 0.7753667235374451, "learning_rate": 0.0002, "epoch": 4.447236180904523, "step": 5310}, {"loss": 1.2513, "grad_norm": 0.8738188743591309, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 5320}, {"loss": 1.41, "grad_norm": 0.8410757184028625, "learning_rate": 0.0002, "epoch": 4.463986599664992, "step": 5330}, {"loss": 1.3809, "grad_norm": 0.728897750377655, "learning_rate": 0.0002, "epoch": 4.472361809045226, "step": 5340}, {"loss": 1.4049, "grad_norm": 0.7880531549453735, "learning_rate": 0.0002, "epoch": 4.480737018425461, "step": 5350}, {"loss": 1.4106, "grad_norm": 0.8455142378807068, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 5360}, {"loss": 1.431, "grad_norm": 0.8527868986129761, "learning_rate": 0.0002, "epoch": 4.49748743718593, "step": 5370}, {"loss": 1.3586, "grad_norm": 0.7743009328842163, "learning_rate": 0.0002, "epoch": 4.505862646566165, "step": 5380}, {"loss": 1.4175, "grad_norm": 0.7555320858955383, "learning_rate": 0.0002, "epoch": 4.514237855946399, "step": 5390}, {"loss": 1.3433, "grad_norm": 0.8146619200706482, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 5400}, {"loss": 1.4859, "grad_norm": 0.8042502999305725, "learning_rate": 0.0002, "epoch": 4.530988274706868, "step": 5410}, {"loss": 1.3843, "grad_norm": 0.7329140305519104, "learning_rate": 0.0002, "epoch": 4.539363484087103, "step": 5420}, {"loss": 1.3946, "grad_norm": 0.7574753165245056, "learning_rate": 0.0002, "epoch": 4.547738693467337, "step": 5430}, {"loss": 1.3048, "grad_norm": 1.1223409175872803, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 5440}, {"loss": 1.4067, "grad_norm": 0.7647369503974915, "learning_rate": 0.0002, "epoch": 4.564489112227806, "step": 5450}, {"loss": 1.4569, "grad_norm": 0.9135531187057495, "learning_rate": 0.0002, "epoch": 4.572864321608041, "step": 5460}, {"loss": 1.4813, "grad_norm": 0.9343693852424622, "learning_rate": 0.0002, "epoch": 4.581239530988275, "step": 5470}, {"loss": 1.385, "grad_norm": 0.869945764541626, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 5480}, {"loss": 1.4067, "grad_norm": 0.7383785843849182, "learning_rate": 0.0002, "epoch": 4.597989949748744, "step": 5490}, {"loss": 1.3698, "grad_norm": 0.7988699674606323, "learning_rate": 0.0002, "epoch": 4.606365159128979, "step": 5500}, {"loss": 1.3834, "grad_norm": 0.8731256127357483, "learning_rate": 0.0002, "epoch": 4.614740368509213, "step": 5510}, {"loss": 1.4393, "grad_norm": 0.7577664256095886, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 5520}, {"loss": 1.4418, "grad_norm": 0.7825039625167847, "learning_rate": 0.0002, "epoch": 4.631490787269682, "step": 5530}, {"loss": 1.4594, "grad_norm": 0.8534902930259705, "learning_rate": 0.0002, "epoch": 4.639865996649917, "step": 5540}, {"loss": 1.3689, "grad_norm": 0.7403318285942078, "learning_rate": 0.0002, "epoch": 4.648241206030151, "step": 5550}, {"loss": 1.4456, "grad_norm": 0.8229990005493164, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 5560}, {"loss": 1.3854, "grad_norm": 0.8279513716697693, "learning_rate": 0.0002, "epoch": 4.66499162479062, "step": 5570}, {"loss": 1.4472, "grad_norm": 0.8923851251602173, "learning_rate": 0.0002, "epoch": 4.673366834170855, "step": 5580}, {"loss": 1.3999, "grad_norm": 0.7457540035247803, "learning_rate": 0.0002, "epoch": 4.681742043551089, "step": 5590}, {"loss": 1.4341, "grad_norm": 0.7110715508460999, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 5600}, {"loss": 1.4327, "grad_norm": 0.7135499119758606, "learning_rate": 0.0002, "epoch": 4.698492462311558, "step": 5610}, {"loss": 1.4321, "grad_norm": 0.7606837153434753, "learning_rate": 0.0002, "epoch": 4.706867671691793, "step": 5620}, {"loss": 1.3792, "grad_norm": 0.9622916579246521, "learning_rate": 0.0002, "epoch": 4.715242881072027, "step": 5630}, {"loss": 1.4, "grad_norm": 0.7665684819221497, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 5640}, {"loss": 1.3837, "grad_norm": 0.7985475659370422, "learning_rate": 0.0002, "epoch": 4.731993299832496, "step": 5650}, {"loss": 1.397, "grad_norm": 0.9179279208183289, "learning_rate": 0.0002, "epoch": 4.740368509212731, "step": 5660}, {"loss": 1.4379, "grad_norm": 0.8311634063720703, "learning_rate": 0.0002, "epoch": 4.748743718592965, "step": 5670}, {"loss": 1.3546, "grad_norm": 0.7773269414901733, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 5680}, {"loss": 1.4031, "grad_norm": 0.7771748900413513, "learning_rate": 0.0002, "epoch": 4.765494137353434, "step": 5690}, {"loss": 1.3724, "grad_norm": 0.7518507242202759, "learning_rate": 0.0002, "epoch": 4.773869346733669, "step": 5700}, {"loss": 1.3247, "grad_norm": 0.7699326276779175, "learning_rate": 0.0002, "epoch": 4.782244556113903, "step": 5710}, {"loss": 1.437, "grad_norm": 0.7001115679740906, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 5720}, {"loss": 1.4257, "grad_norm": 0.7220682501792908, "learning_rate": 0.0002, "epoch": 4.798994974874372, "step": 5730}, {"loss": 1.4174, "grad_norm": 0.7654005289077759, "learning_rate": 0.0002, "epoch": 4.807370184254607, "step": 5740}, {"loss": 1.3792, "grad_norm": 0.8132795095443726, "learning_rate": 0.0002, "epoch": 4.815745393634841, "step": 5750}, {"loss": 1.4007, "grad_norm": 0.7105404138565063, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 5760}, {"loss": 1.4289, "grad_norm": 0.9346209764480591, "learning_rate": 0.0002, "epoch": 4.83249581239531, "step": 5770}, {"loss": 1.4066, "grad_norm": 1.0075623989105225, "learning_rate": 0.0002, "epoch": 4.840871021775545, "step": 5780}, {"loss": 1.4558, "grad_norm": 0.758376955986023, "learning_rate": 0.0002, "epoch": 4.849246231155779, "step": 5790}, {"loss": 1.4117, "grad_norm": 0.854821503162384, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 5800}, {"loss": 1.4014, "grad_norm": 0.8226943016052246, "learning_rate": 0.0002, "epoch": 4.865996649916248, "step": 5810}, {"loss": 1.3963, "grad_norm": 0.7510473728179932, "learning_rate": 0.0002, "epoch": 4.874371859296483, "step": 5820}, {"loss": 1.4463, "grad_norm": 0.7449678182601929, "learning_rate": 0.0002, "epoch": 4.882747068676717, "step": 5830}, {"loss": 1.3691, "grad_norm": 0.7840824723243713, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 5840}, {"loss": 1.3795, "grad_norm": 0.8811169862747192, "learning_rate": 0.0002, "epoch": 4.899497487437186, "step": 5850}, {"loss": 1.3827, "grad_norm": 0.84914630651474, "learning_rate": 0.0002, "epoch": 4.907872696817421, "step": 5860}, {"loss": 1.4549, "grad_norm": 0.7514461874961853, "learning_rate": 0.0002, "epoch": 4.916247906197655, "step": 5870}, {"loss": 1.3633, "grad_norm": 0.7229002118110657, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 5880}, {"loss": 1.4302, "grad_norm": 0.9418245553970337, "learning_rate": 0.0002, "epoch": 4.932998324958124, "step": 5890}, {"loss": 1.4747, "grad_norm": 0.7626827359199524, "learning_rate": 0.0002, "epoch": 4.941373534338359, "step": 5900}, {"loss": 1.4462, "grad_norm": 0.7711105346679688, "learning_rate": 0.0002, "epoch": 4.949748743718593, "step": 5910}, {"loss": 1.4104, "grad_norm": 0.8689648509025574, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 5920}, {"loss": 1.4273, "grad_norm": 0.7873271107673645, "learning_rate": 0.0002, "epoch": 4.966499162479062, "step": 5930}, {"loss": 1.4361, "grad_norm": 0.7637495994567871, "learning_rate": 0.0002, "epoch": 4.974874371859297, "step": 5940}, {"loss": 1.5037, "grad_norm": 0.9907955527305603, "learning_rate": 0.0002, "epoch": 4.983249581239531, "step": 5950}, {"loss": 1.4476, "grad_norm": 0.7827328443527222, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 5960}, {"loss": 1.4252, "grad_norm": 0.818544328212738, "learning_rate": 0.0002, "epoch": 5.0, "step": 5970}, {"eval_loss": 1.9436752796173096, "eval_runtime": 38.087, "eval_samples_per_second": 13.522, "eval_steps_per_second": 1.707, "epoch": 5.0, "step": 5970}, {"loss": 1.2367, "grad_norm": 1.1248953342437744, "learning_rate": 0.0002, "epoch": 5.008375209380235, "step": 5980}, {"loss": 1.2221, "grad_norm": 0.9285888075828552, "learning_rate": 0.0002, "epoch": 5.016750418760469, "step": 5990}, {"loss": 1.263, "grad_norm": 0.8626338839530945, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 6000}, {"loss": 1.1839, "grad_norm": 0.8253921270370483, "learning_rate": 0.0002, "epoch": 5.033500837520938, "step": 6010}, {"loss": 1.2773, "grad_norm": 1.079628586769104, "learning_rate": 0.0002, "epoch": 5.041876046901173, "step": 6020}, {"loss": 1.2419, "grad_norm": 0.902625322341919, "learning_rate": 0.0002, "epoch": 5.050251256281407, "step": 6030}, {"loss": 1.164, "grad_norm": 0.9593151211738586, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 6040}, {"loss": 1.2442, "grad_norm": 0.9276060461997986, "learning_rate": 0.0002, "epoch": 5.067001675041876, "step": 6050}, {"loss": 1.2496, "grad_norm": 1.0472362041473389, "learning_rate": 0.0002, "epoch": 5.075376884422111, "step": 6060}, {"loss": 1.2241, "grad_norm": 0.9126865863800049, "learning_rate": 0.0002, "epoch": 5.083752093802345, "step": 6070}, {"loss": 1.1997, "grad_norm": 1.0797888040542603, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 6080}, {"loss": 1.2299, "grad_norm": 0.9538877010345459, "learning_rate": 0.0002, "epoch": 5.100502512562814, "step": 6090}, {"loss": 1.2585, "grad_norm": 1.0604161024093628, "learning_rate": 0.0002, "epoch": 5.108877721943049, "step": 6100}, {"loss": 1.2627, "grad_norm": 1.0178192853927612, "learning_rate": 0.0002, "epoch": 5.117252931323283, "step": 6110}, {"loss": 1.2848, "grad_norm": 1.0262689590454102, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 6120}, {"loss": 1.228, "grad_norm": 0.9046729803085327, "learning_rate": 0.0002, "epoch": 5.134003350083752, "step": 6130}, {"loss": 1.2051, "grad_norm": 1.1244608163833618, "learning_rate": 0.0002, "epoch": 5.142378559463987, "step": 6140}, {"loss": 1.2751, "grad_norm": 1.082835078239441, "learning_rate": 0.0002, "epoch": 5.150753768844221, "step": 6150}, {"loss": 1.1625, "grad_norm": 0.9078734517097473, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 6160}, {"loss": 1.2122, "grad_norm": 1.0688848495483398, "learning_rate": 0.0002, "epoch": 5.16750418760469, "step": 6170}, {"loss": 1.2143, "grad_norm": 1.137519359588623, "learning_rate": 0.0002, "epoch": 5.175879396984925, "step": 6180}, {"loss": 1.3125, "grad_norm": 1.0728670358657837, "learning_rate": 0.0002, "epoch": 5.184254606365159, "step": 6190}, {"loss": 1.2352, "grad_norm": 1.2384949922561646, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 6200}, {"loss": 1.2173, "grad_norm": 0.8391274809837341, "learning_rate": 0.0002, "epoch": 5.201005025125628, "step": 6210}, {"loss": 1.2179, "grad_norm": 0.8948764801025391, "learning_rate": 0.0002, "epoch": 5.209380234505863, "step": 6220}, {"loss": 1.2467, "grad_norm": 0.9568309783935547, "learning_rate": 0.0002, "epoch": 5.217755443886097, "step": 6230}, {"loss": 1.2761, "grad_norm": 1.0604485273361206, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 6240}, {"loss": 1.1407, "grad_norm": 1.1278935670852661, "learning_rate": 0.0002, "epoch": 5.234505862646566, "step": 6250}, {"loss": 1.2332, "grad_norm": 0.9903607368469238, "learning_rate": 0.0002, "epoch": 5.242881072026801, "step": 6260}, {"loss": 1.2544, "grad_norm": 0.958718478679657, "learning_rate": 0.0002, "epoch": 5.251256281407035, "step": 6270}, {"loss": 1.2746, "grad_norm": 1.127510905265808, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 6280}, {"loss": 1.2589, "grad_norm": 1.1683127880096436, "learning_rate": 0.0002, "epoch": 5.268006700167504, "step": 6290}, {"loss": 1.2959, "grad_norm": 1.0723326206207275, "learning_rate": 0.0002, "epoch": 5.276381909547739, "step": 6300}, {"loss": 1.2522, "grad_norm": 0.9285374283790588, "learning_rate": 0.0002, "epoch": 5.284757118927973, "step": 6310}, {"loss": 1.2539, "grad_norm": 0.9201741218566895, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 6320}, {"loss": 1.1816, "grad_norm": 0.9606702923774719, "learning_rate": 0.0002, "epoch": 5.301507537688442, "step": 6330}, {"loss": 1.2928, "grad_norm": 1.107960820198059, "learning_rate": 0.0002, "epoch": 5.309882747068677, "step": 6340}, {"loss": 1.209, "grad_norm": 0.9342933297157288, "learning_rate": 0.0002, "epoch": 5.318257956448911, "step": 6350}, {"loss": 1.2023, "grad_norm": 0.9170576930046082, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 6360}, {"loss": 1.2239, "grad_norm": 0.7612091898918152, "learning_rate": 0.0002, "epoch": 5.33500837520938, "step": 6370}, {"loss": 1.2176, "grad_norm": 1.2524093389511108, "learning_rate": 0.0002, "epoch": 5.343383584589615, "step": 6380}, {"loss": 1.219, "grad_norm": 0.8481650352478027, "learning_rate": 0.0002, "epoch": 5.351758793969849, "step": 6390}, {"loss": 1.237, "grad_norm": 1.0562204122543335, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 6400}, {"loss": 1.1844, "grad_norm": 0.96522456407547, "learning_rate": 0.0002, "epoch": 5.368509212730318, "step": 6410}, {"loss": 1.2465, "grad_norm": 0.9680143594741821, "learning_rate": 0.0002, "epoch": 5.376884422110553, "step": 6420}, {"loss": 1.2809, "grad_norm": 0.9743781685829163, "learning_rate": 0.0002, "epoch": 5.385259631490787, "step": 6430}, {"loss": 1.2637, "grad_norm": 0.8907374143600464, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 6440}, {"loss": 1.2174, "grad_norm": 1.3755217790603638, "learning_rate": 0.0002, "epoch": 5.402010050251256, "step": 6450}, {"loss": 1.224, "grad_norm": 1.1926233768463135, "learning_rate": 0.0002, "epoch": 5.410385259631491, "step": 6460}, {"loss": 1.1685, "grad_norm": 0.8343448638916016, "learning_rate": 0.0002, "epoch": 5.418760469011725, "step": 6470}, {"loss": 1.232, "grad_norm": 1.0056027173995972, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 6480}, {"loss": 1.2936, "grad_norm": 0.9482131600379944, "learning_rate": 0.0002, "epoch": 5.435510887772194, "step": 6490}, {"loss": 1.3084, "grad_norm": 0.9766585826873779, "learning_rate": 0.0002, "epoch": 5.443886097152429, "step": 6500}, {"loss": 1.2758, "grad_norm": 0.9226584434509277, "learning_rate": 0.0002, "epoch": 5.452261306532663, "step": 6510}, {"loss": 1.328, "grad_norm": 0.9605025053024292, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 6520}, {"loss": 1.3285, "grad_norm": 1.0022773742675781, "learning_rate": 0.0002, "epoch": 5.469011725293132, "step": 6530}, {"loss": 1.3126, "grad_norm": 1.056764841079712, "learning_rate": 0.0002, "epoch": 5.477386934673367, "step": 6540}, {"loss": 1.3018, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 5.485762144053601, "step": 6550}, {"loss": 1.2633, "grad_norm": 0.8987206816673279, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 6560}, {"loss": 1.2356, "grad_norm": 1.1946845054626465, "learning_rate": 0.0002, "epoch": 5.50251256281407, "step": 6570}, {"loss": 1.2613, "grad_norm": 1.037416696548462, "learning_rate": 0.0002, "epoch": 5.510887772194305, "step": 6580}, {"loss": 1.2873, "grad_norm": 1.085598349571228, "learning_rate": 0.0002, "epoch": 5.519262981574539, "step": 6590}, {"loss": 1.2562, "grad_norm": 0.9253745079040527, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 6600}, {"loss": 1.3037, "grad_norm": 1.0624418258666992, "learning_rate": 0.0002, "epoch": 5.536013400335008, "step": 6610}, {"loss": 1.2523, "grad_norm": 1.002821922302246, "learning_rate": 0.0002, "epoch": 5.544388609715243, "step": 6620}, {"loss": 1.2662, "grad_norm": 0.9343662858009338, "learning_rate": 0.0002, "epoch": 5.552763819095477, "step": 6630}, {"loss": 1.2467, "grad_norm": 0.9129965305328369, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 6640}, {"loss": 1.2931, "grad_norm": 1.220263957977295, "learning_rate": 0.0002, "epoch": 5.569514237855946, "step": 6650}, {"loss": 1.2638, "grad_norm": 0.9705421924591064, "learning_rate": 0.0002, "epoch": 5.577889447236181, "step": 6660}, {"loss": 1.2815, "grad_norm": 0.8417587876319885, "learning_rate": 0.0002, "epoch": 5.586264656616415, "step": 6670}, {"loss": 1.3616, "grad_norm": 0.9351304769515991, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 6680}, {"loss": 1.2795, "grad_norm": 1.012598991394043, "learning_rate": 0.0002, "epoch": 5.603015075376884, "step": 6690}, {"loss": 1.2457, "grad_norm": 1.018328309059143, "learning_rate": 0.0002, "epoch": 5.611390284757119, "step": 6700}, {"loss": 1.3084, "grad_norm": 0.9289278388023376, "learning_rate": 0.0002, "epoch": 5.619765494137353, "step": 6710}, {"loss": 1.2645, "grad_norm": 0.8390841484069824, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 6720}, {"loss": 1.2676, "grad_norm": 0.9989390969276428, "learning_rate": 0.0002, "epoch": 5.636515912897822, "step": 6730}, {"loss": 1.2937, "grad_norm": 1.0675761699676514, "learning_rate": 0.0002, "epoch": 5.644891122278057, "step": 6740}, {"loss": 1.2599, "grad_norm": 1.0649791955947876, "learning_rate": 0.0002, "epoch": 5.653266331658291, "step": 6750}, {"loss": 1.2191, "grad_norm": 0.8542222380638123, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 6760}, {"loss": 1.2336, "grad_norm": 0.9148173928260803, "learning_rate": 0.0002, "epoch": 5.67001675041876, "step": 6770}, {"loss": 1.3286, "grad_norm": 0.978024423122406, "learning_rate": 0.0002, "epoch": 5.678391959798995, "step": 6780}, {"loss": 1.2821, "grad_norm": 1.0385138988494873, "learning_rate": 0.0002, "epoch": 5.686767169179229, "step": 6790}, {"loss": 1.218, "grad_norm": 0.9687889218330383, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 6800}, {"loss": 1.3256, "grad_norm": 0.862335205078125, "learning_rate": 0.0002, "epoch": 5.703517587939698, "step": 6810}, {"loss": 1.2783, "grad_norm": 0.9729578495025635, "learning_rate": 0.0002, "epoch": 5.711892797319933, "step": 6820}, {"loss": 1.3318, "grad_norm": 0.8936806321144104, "learning_rate": 0.0002, "epoch": 5.720268006700167, "step": 6830}, {"loss": 1.27, "grad_norm": 0.9222455620765686, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 6840}, {"loss": 1.2097, "grad_norm": 1.0584437847137451, "learning_rate": 0.0002, "epoch": 5.7370184254606365, "step": 6850}, {"loss": 1.2308, "grad_norm": 0.9114518165588379, "learning_rate": 0.0002, "epoch": 5.745393634840871, "step": 6860}, {"loss": 1.2767, "grad_norm": 0.9590078592300415, "learning_rate": 0.0002, "epoch": 5.7537688442211055, "step": 6870}, {"loss": 1.2639, "grad_norm": 0.9056822061538696, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 6880}, {"loss": 1.3257, "grad_norm": 1.0069063901901245, "learning_rate": 0.0002, "epoch": 5.7705192629815745, "step": 6890}, {"loss": 1.3382, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 5.778894472361809, "step": 6900}, {"loss": 1.2907, "grad_norm": 0.881629228591919, "learning_rate": 0.0002, "epoch": 5.7872696817420435, "step": 6910}, {"loss": 1.3122, "grad_norm": 1.1020095348358154, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 6920}, {"loss": 1.2985, "grad_norm": 0.8774619102478027, "learning_rate": 0.0002, "epoch": 5.8040201005025125, "step": 6930}, {"loss": 1.311, "grad_norm": 0.9321739673614502, "learning_rate": 0.0002, "epoch": 5.812395309882747, "step": 6940}, {"loss": 1.2951, "grad_norm": 0.9082857966423035, "learning_rate": 0.0002, "epoch": 5.8207705192629815, "step": 6950}, {"loss": 1.2582, "grad_norm": 0.9119554758071899, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 6960}, {"loss": 1.2777, "grad_norm": 1.0643284320831299, "learning_rate": 0.0002, "epoch": 5.8375209380234505, "step": 6970}, {"loss": 1.3319, "grad_norm": 0.8526089787483215, "learning_rate": 0.0002, "epoch": 5.8458961474036855, "step": 6980}, {"loss": 1.2539, "grad_norm": 0.930439829826355, "learning_rate": 0.0002, "epoch": 5.8542713567839195, "step": 6990}, {"loss": 1.3059, "grad_norm": 1.0461677312850952, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 7000}, {"loss": 1.2623, "grad_norm": 0.92561936378479, "learning_rate": 0.0002, "epoch": 5.8710217755443885, "step": 7010}, {"loss": 1.2354, "grad_norm": 0.8936395049095154, "learning_rate": 0.0002, "epoch": 5.8793969849246235, "step": 7020}, {"loss": 1.3232, "grad_norm": 0.986539363861084, "learning_rate": 0.0002, "epoch": 5.8877721943048575, "step": 7030}, {"loss": 1.2399, "grad_norm": 0.8776476383209229, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 7040}, {"loss": 1.2374, "grad_norm": 1.0256905555725098, "learning_rate": 0.0002, "epoch": 5.9045226130653266, "step": 7050}, {"loss": 1.3049, "grad_norm": 0.96241295337677, "learning_rate": 0.0002, "epoch": 5.9128978224455615, "step": 7060}, {"loss": 1.2349, "grad_norm": 1.0251280069351196, "learning_rate": 0.0002, "epoch": 5.921273031825796, "step": 7070}, {"loss": 1.2225, "grad_norm": 1.0794076919555664, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 7080}, {"loss": 1.2978, "grad_norm": 0.9852448105812073, "learning_rate": 0.0002, "epoch": 5.938023450586265, "step": 7090}, {"loss": 1.3278, "grad_norm": 1.1678671836853027, "learning_rate": 0.0002, "epoch": 5.9463986599664995, "step": 7100}, {"loss": 1.2908, "grad_norm": 0.9818310141563416, "learning_rate": 0.0002, "epoch": 5.954773869346734, "step": 7110}, {"loss": 1.3406, "grad_norm": 1.0732046365737915, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 7120}, {"loss": 1.2402, "grad_norm": 0.912470281124115, "learning_rate": 0.0002, "epoch": 5.971524288107203, "step": 7130}, {"loss": 1.2979, "grad_norm": 1.0944788455963135, "learning_rate": 0.0002, "epoch": 5.9798994974874375, "step": 7140}, {"loss": 1.3249, "grad_norm": 1.0393965244293213, "learning_rate": 0.0002, "epoch": 5.988274706867672, "step": 7150}, {"loss": 1.2913, "grad_norm": 0.8758739233016968, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 7160}]} +{"epoch": 7.0, "step": 8358, "epoch_duration": 1340.3200409412384, "total_accumulated_duration": 9270.30408000946, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}, {"eval_loss": 1.8283122777938843, "eval_runtime": 38.023, "eval_samples_per_second": 13.544, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 3582}, {"loss": 1.5874, "grad_norm": 0.6784713268280029, "learning_rate": 0.0002, "epoch": 3.006700167504188, "step": 3590}, {"loss": 1.5813, "grad_norm": 0.5783940553665161, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 3600}, {"loss": 1.4769, "grad_norm": 0.5408937335014343, "learning_rate": 0.0002, "epoch": 3.023450586264657, "step": 3610}, {"loss": 1.526, "grad_norm": 0.5229013562202454, "learning_rate": 0.0002, "epoch": 3.0318257956448913, "step": 3620}, {"loss": 1.4835, "grad_norm": 0.49160143733024597, "learning_rate": 0.0002, "epoch": 3.040201005025126, "step": 3630}, {"loss": 1.5398, "grad_norm": 0.6563201546669006, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 3640}, {"loss": 1.448, "grad_norm": 0.5686020851135254, "learning_rate": 0.0002, "epoch": 3.056951423785595, "step": 3650}, {"loss": 1.4541, "grad_norm": 0.5774043202400208, "learning_rate": 0.0002, "epoch": 3.0653266331658293, "step": 3660}, {"loss": 1.4734, "grad_norm": 0.6106171011924744, "learning_rate": 0.0002, "epoch": 3.073701842546064, "step": 3670}, {"loss": 1.4961, "grad_norm": 0.517433226108551, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 3680}, {"loss": 1.4961, "grad_norm": 0.5681702494621277, "learning_rate": 0.0002, "epoch": 3.090452261306533, "step": 3690}, {"loss": 1.4731, "grad_norm": 0.5769233107566833, "learning_rate": 0.0002, "epoch": 3.0988274706867673, "step": 3700}, {"loss": 1.4836, "grad_norm": 0.5657462477684021, "learning_rate": 0.0002, "epoch": 3.107202680067002, "step": 3710}, {"loss": 1.4526, "grad_norm": 0.6035246253013611, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 3720}, {"loss": 1.5102, "grad_norm": 0.7286643385887146, "learning_rate": 0.0002, "epoch": 3.123953098827471, "step": 3730}, {"loss": 1.4444, "grad_norm": 0.5121201872825623, "learning_rate": 0.0002, "epoch": 3.1323283082077054, "step": 3740}, {"loss": 1.565, "grad_norm": 0.5074213147163391, "learning_rate": 0.0002, "epoch": 3.14070351758794, "step": 3750}, {"loss": 1.4729, "grad_norm": 0.57481849193573, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 3760}, {"loss": 1.4765, "grad_norm": 0.6326663494110107, "learning_rate": 0.0002, "epoch": 3.157453936348409, "step": 3770}, {"loss": 1.4888, "grad_norm": 0.6039315462112427, "learning_rate": 0.0002, "epoch": 3.1658291457286434, "step": 3780}, {"loss": 1.5084, "grad_norm": 0.6936715245246887, "learning_rate": 0.0002, "epoch": 3.174204355108878, "step": 3790}, {"loss": 1.4879, "grad_norm": 0.6516796946525574, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 3800}, {"loss": 1.578, "grad_norm": 0.6140730977058411, "learning_rate": 0.0002, "epoch": 3.190954773869347, "step": 3810}, {"loss": 1.5101, "grad_norm": 0.631328284740448, "learning_rate": 0.0002, "epoch": 3.1993299832495814, "step": 3820}, {"loss": 1.4844, "grad_norm": 0.6265402436256409, "learning_rate": 0.0002, "epoch": 3.207705192629816, "step": 3830}, {"loss": 1.5332, "grad_norm": 0.6649428606033325, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 3840}, {"loss": 1.5231, "grad_norm": 0.5329259634017944, "learning_rate": 0.0002, "epoch": 3.224455611390285, "step": 3850}, {"loss": 1.5714, "grad_norm": 0.6008304953575134, "learning_rate": 0.0002, "epoch": 3.2328308207705194, "step": 3860}, {"loss": 1.5214, "grad_norm": 0.5918582081794739, "learning_rate": 0.0002, "epoch": 3.241206030150754, "step": 3870}, {"loss": 1.571, "grad_norm": 0.643622100353241, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 3880}, {"loss": 1.5274, "grad_norm": 0.5517964363098145, "learning_rate": 0.0002, "epoch": 3.257956448911223, "step": 3890}, {"loss": 1.5458, "grad_norm": 0.6780755519866943, "learning_rate": 0.0002, "epoch": 3.2663316582914574, "step": 3900}, {"loss": 1.5743, "grad_norm": 0.6742202639579773, "learning_rate": 0.0002, "epoch": 3.274706867671692, "step": 3910}, {"loss": 1.5279, "grad_norm": 0.6228749752044678, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 3920}, {"loss": 1.4899, "grad_norm": 0.5836303234100342, "learning_rate": 0.0002, "epoch": 3.291457286432161, "step": 3930}, {"loss": 1.5445, "grad_norm": 0.6337724328041077, "learning_rate": 0.0002, "epoch": 3.2998324958123955, "step": 3940}, {"loss": 1.5618, "grad_norm": 0.6345084309577942, "learning_rate": 0.0002, "epoch": 3.30820770519263, "step": 3950}, {"loss": 1.4224, "grad_norm": 0.6125303506851196, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 3960}, {"loss": 1.5355, "grad_norm": 0.6259911060333252, "learning_rate": 0.0002, "epoch": 3.324958123953099, "step": 3970}, {"loss": 1.5427, "grad_norm": 0.645745575428009, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 3980}, {"loss": 1.5817, "grad_norm": 0.6666176915168762, "learning_rate": 0.0002, "epoch": 3.341708542713568, "step": 3990}, {"loss": 1.4998, "grad_norm": 0.59013831615448, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 4000}, {"loss": 1.4921, "grad_norm": 0.6604634523391724, "learning_rate": 0.0002, "epoch": 3.358458961474037, "step": 4010}, {"loss": 1.5076, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 3.3668341708542715, "step": 4020}, {"loss": 1.4801, "grad_norm": 0.515724778175354, "learning_rate": 0.0002, "epoch": 3.375209380234506, "step": 4030}, {"loss": 1.4932, "grad_norm": 0.681968092918396, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 4040}, {"loss": 1.5148, "grad_norm": 0.5978158116340637, "learning_rate": 0.0002, "epoch": 3.391959798994975, "step": 4050}, {"loss": 1.5449, "grad_norm": 0.6043432354927063, "learning_rate": 0.0002, "epoch": 3.4003350083752095, "step": 4060}, {"loss": 1.5021, "grad_norm": 0.5899770855903625, "learning_rate": 0.0002, "epoch": 3.408710217755444, "step": 4070}, {"loss": 1.5992, "grad_norm": 0.6014242172241211, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 4080}, {"loss": 1.4692, "grad_norm": 0.5944811105728149, "learning_rate": 0.0002, "epoch": 3.425460636515913, "step": 4090}, {"loss": 1.5877, "grad_norm": 0.6506822109222412, "learning_rate": 0.0002, "epoch": 3.4338358458961475, "step": 4100}, {"loss": 1.5144, "grad_norm": 0.6926528811454773, "learning_rate": 0.0002, "epoch": 3.442211055276382, "step": 4110}, {"loss": 1.5169, "grad_norm": 0.5646378993988037, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 4120}, {"loss": 1.5032, "grad_norm": 0.7233654856681824, "learning_rate": 0.0002, "epoch": 3.458961474036851, "step": 4130}, {"loss": 1.5161, "grad_norm": 0.6231815814971924, "learning_rate": 0.0002, "epoch": 3.4673366834170856, "step": 4140}, {"loss": 1.5349, "grad_norm": 0.6115689873695374, "learning_rate": 0.0002, "epoch": 3.47571189279732, "step": 4150}, {"loss": 1.4621, "grad_norm": 0.5812674760818481, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 4160}, {"loss": 1.5465, "grad_norm": 0.6099632978439331, "learning_rate": 0.0002, "epoch": 3.492462311557789, "step": 4170}, {"loss": 1.4795, "grad_norm": 0.6102647185325623, "learning_rate": 0.0002, "epoch": 3.5008375209380236, "step": 4180}, {"loss": 1.5305, "grad_norm": 0.6034680008888245, "learning_rate": 0.0002, "epoch": 3.509212730318258, "step": 4190}, {"loss": 1.5093, "grad_norm": 0.6281666159629822, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 4200}, {"loss": 1.4903, "grad_norm": 0.6245372295379639, "learning_rate": 0.0002, "epoch": 3.525963149078727, "step": 4210}, {"loss": 1.5098, "grad_norm": 0.5897293090820312, "learning_rate": 0.0002, "epoch": 3.5343383584589616, "step": 4220}, {"loss": 1.5991, "grad_norm": 0.601054847240448, "learning_rate": 0.0002, "epoch": 3.542713567839196, "step": 4230}, {"loss": 1.4974, "grad_norm": 0.7004473805427551, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 4240}, {"loss": 1.5993, "grad_norm": 0.6601553559303284, "learning_rate": 0.0002, "epoch": 3.559463986599665, "step": 4250}, {"loss": 1.4961, "grad_norm": 0.6112467050552368, "learning_rate": 0.0002, "epoch": 3.5678391959798996, "step": 4260}, {"loss": 1.4967, "grad_norm": 0.5902454853057861, "learning_rate": 0.0002, "epoch": 3.576214405360134, "step": 4270}, {"loss": 1.5659, "grad_norm": 0.5792450904846191, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 4280}, {"loss": 1.4664, "grad_norm": 0.5923888087272644, "learning_rate": 0.0002, "epoch": 3.592964824120603, "step": 4290}, {"loss": 1.5155, "grad_norm": 0.5869482159614563, "learning_rate": 0.0002, "epoch": 3.6013400335008376, "step": 4300}, {"loss": 1.5119, "grad_norm": 0.6372929811477661, "learning_rate": 0.0002, "epoch": 3.609715242881072, "step": 4310}, {"loss": 1.4977, "grad_norm": 0.6350686550140381, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 4320}, {"loss": 1.5226, "grad_norm": 0.571819007396698, "learning_rate": 0.0002, "epoch": 3.626465661641541, "step": 4330}, {"loss": 1.5414, "grad_norm": 0.592250645160675, "learning_rate": 0.0002, "epoch": 3.6348408710217757, "step": 4340}, {"loss": 1.4912, "grad_norm": 0.6110650897026062, "learning_rate": 0.0002, "epoch": 3.64321608040201, "step": 4350}, {"loss": 1.6089, "grad_norm": 0.6187081336975098, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 4360}, {"loss": 1.5345, "grad_norm": 0.6197671890258789, "learning_rate": 0.0002, "epoch": 3.659966499162479, "step": 4370}, {"loss": 1.4988, "grad_norm": 0.6050862669944763, "learning_rate": 0.0002, "epoch": 3.6683417085427137, "step": 4380}, {"loss": 1.4872, "grad_norm": 0.621265172958374, "learning_rate": 0.0002, "epoch": 3.676716917922948, "step": 4390}, {"loss": 1.6011, "grad_norm": 0.6552940011024475, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 4400}, {"loss": 1.4344, "grad_norm": 0.5638861060142517, "learning_rate": 0.0002, "epoch": 3.693467336683417, "step": 4410}, {"loss": 1.4985, "grad_norm": 0.6388863325119019, "learning_rate": 0.0002, "epoch": 3.7018425460636517, "step": 4420}, {"loss": 1.3696, "grad_norm": 0.6062559485435486, "learning_rate": 0.0002, "epoch": 3.710217755443886, "step": 4430}, {"loss": 1.5101, "grad_norm": 0.5800350308418274, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 4440}, {"loss": 1.5286, "grad_norm": 0.5954474210739136, "learning_rate": 0.0002, "epoch": 3.726968174204355, "step": 4450}, {"loss": 1.6133, "grad_norm": 0.5880125761032104, "learning_rate": 0.0002, "epoch": 3.7353433835845897, "step": 4460}, {"loss": 1.5055, "grad_norm": 0.5880921483039856, "learning_rate": 0.0002, "epoch": 3.7437185929648242, "step": 4470}, {"loss": 1.5728, "grad_norm": 0.5995073914527893, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 4480}, {"loss": 1.554, "grad_norm": 0.5958493947982788, "learning_rate": 0.0002, "epoch": 3.7604690117252932, "step": 4490}, {"loss": 1.5472, "grad_norm": 0.5694711804389954, "learning_rate": 0.0002, "epoch": 3.7688442211055277, "step": 4500}, {"loss": 1.5105, "grad_norm": 0.6175141930580139, "learning_rate": 0.0002, "epoch": 3.7772194304857623, "step": 4510}, {"loss": 1.5404, "grad_norm": 0.5541581511497498, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 4520}, {"loss": 1.5283, "grad_norm": 0.5986164808273315, "learning_rate": 0.0002, "epoch": 3.7939698492462313, "step": 4530}, {"loss": 1.4961, "grad_norm": 0.640072226524353, "learning_rate": 0.0002, "epoch": 3.8023450586264658, "step": 4540}, {"loss": 1.5297, "grad_norm": 0.5742579698562622, "learning_rate": 0.0002, "epoch": 3.8107202680067003, "step": 4550}, {"loss": 1.5591, "grad_norm": 0.6658656001091003, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 4560}, {"loss": 1.4992, "grad_norm": 0.5475369691848755, "learning_rate": 0.0002, "epoch": 3.8274706867671693, "step": 4570}, {"loss": 1.5966, "grad_norm": 0.613172173500061, "learning_rate": 0.0002, "epoch": 3.835845896147404, "step": 4580}, {"loss": 1.5594, "grad_norm": 0.590968132019043, "learning_rate": 0.0002, "epoch": 3.8442211055276383, "step": 4590}, {"loss": 1.5067, "grad_norm": 0.5865461826324463, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 4600}, {"loss": 1.5247, "grad_norm": 0.6815178990364075, "learning_rate": 0.0002, "epoch": 3.8609715242881073, "step": 4610}, {"loss": 1.5702, "grad_norm": 0.6551400423049927, "learning_rate": 0.0002, "epoch": 3.869346733668342, "step": 4620}, {"loss": 1.4891, "grad_norm": 0.6398897171020508, "learning_rate": 0.0002, "epoch": 3.8777219430485763, "step": 4630}, {"loss": 1.5353, "grad_norm": 0.6761762499809265, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 4640}, {"loss": 1.6071, "grad_norm": 0.6277294754981995, "learning_rate": 0.0002, "epoch": 3.8944723618090453, "step": 4650}, {"loss": 1.5605, "grad_norm": 0.6285301446914673, "learning_rate": 0.0002, "epoch": 3.90284757118928, "step": 4660}, {"loss": 1.5937, "grad_norm": 0.5416069626808167, "learning_rate": 0.0002, "epoch": 3.9112227805695143, "step": 4670}, {"loss": 1.5461, "grad_norm": 0.6314545273780823, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 4680}, {"loss": 1.4828, "grad_norm": 0.604479968547821, "learning_rate": 0.0002, "epoch": 3.9279731993299833, "step": 4690}, {"loss": 1.5186, "grad_norm": 0.5321660041809082, "learning_rate": 0.0002, "epoch": 3.936348408710218, "step": 4700}, {"loss": 1.4696, "grad_norm": 0.6632516980171204, "learning_rate": 0.0002, "epoch": 3.9447236180904524, "step": 4710}, {"loss": 1.519, "grad_norm": 0.5925896763801575, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 4720}, {"loss": 1.5716, "grad_norm": 0.6580308675765991, "learning_rate": 0.0002, "epoch": 3.9614740368509214, "step": 4730}, {"loss": 1.4462, "grad_norm": 0.5578170418739319, "learning_rate": 0.0002, "epoch": 3.969849246231156, "step": 4740}, {"loss": 1.5394, "grad_norm": 0.6216608285903931, "learning_rate": 0.0002, "epoch": 3.9782244556113904, "step": 4750}, {"loss": 1.5395, "grad_norm": 0.5693069696426392, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 4760}, {"loss": 1.5517, "grad_norm": 0.5353434681892395, "learning_rate": 0.0002, "epoch": 3.9949748743718594, "step": 4770}, {"eval_loss": 1.8809821605682373, "eval_runtime": 37.9695, "eval_samples_per_second": 13.564, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4776}, {"loss": 1.4608, "grad_norm": 0.6117817759513855, "learning_rate": 0.0002, "epoch": 4.0033500837520934, "step": 4780}, {"loss": 1.2982, "grad_norm": 0.6816073656082153, "learning_rate": 0.0002, "epoch": 4.011725293132328, "step": 4790}, {"loss": 1.3464, "grad_norm": 0.715548038482666, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 4800}, {"loss": 1.3918, "grad_norm": 0.8585814833641052, "learning_rate": 0.0002, "epoch": 4.028475711892797, "step": 4810}, {"loss": 1.4137, "grad_norm": 0.7372158765792847, "learning_rate": 0.0002, "epoch": 4.0368509212730315, "step": 4820}, {"loss": 1.3769, "grad_norm": 0.8915117979049683, "learning_rate": 0.0002, "epoch": 4.045226130653266, "step": 4830}, {"loss": 1.3551, "grad_norm": 0.9323588013648987, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 4840}, {"loss": 1.3687, "grad_norm": 0.9298437237739563, "learning_rate": 0.0002, "epoch": 4.061976549413735, "step": 4850}, {"loss": 1.4173, "grad_norm": 0.8541792035102844, "learning_rate": 0.0002, "epoch": 4.0703517587939695, "step": 4860}, {"loss": 1.3668, "grad_norm": 0.7833571434020996, "learning_rate": 0.0002, "epoch": 4.078726968174204, "step": 4870}, {"loss": 1.3835, "grad_norm": 0.9325295090675354, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 4880}, {"loss": 1.3834, "grad_norm": 0.7066370248794556, "learning_rate": 0.0002, "epoch": 4.0954773869346734, "step": 4890}, {"loss": 1.3661, "grad_norm": 0.712640643119812, "learning_rate": 0.0002, "epoch": 4.1038525963149075, "step": 4900}, {"loss": 1.3637, "grad_norm": 0.6970218420028687, "learning_rate": 0.0002, "epoch": 4.1122278056951425, "step": 4910}, {"loss": 1.3805, "grad_norm": 0.7979312539100647, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 4920}, {"loss": 1.4115, "grad_norm": 0.7801558375358582, "learning_rate": 0.0002, "epoch": 4.1289782244556115, "step": 4930}, {"loss": 1.3288, "grad_norm": 0.7505159974098206, "learning_rate": 0.0002, "epoch": 4.1373534338358455, "step": 4940}, {"loss": 1.3453, "grad_norm": 0.738201916217804, "learning_rate": 0.0002, "epoch": 4.1457286432160805, "step": 4950}, {"loss": 1.3418, "grad_norm": 0.7736659049987793, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 4960}, {"loss": 1.3663, "grad_norm": 0.7850064635276794, "learning_rate": 0.0002, "epoch": 4.1624790619765495, "step": 4970}, {"loss": 1.326, "grad_norm": 0.8316620588302612, "learning_rate": 0.0002, "epoch": 4.1708542713567835, "step": 4980}, {"loss": 1.377, "grad_norm": 0.7217330932617188, "learning_rate": 0.0002, "epoch": 4.1792294807370185, "step": 4990}, {"loss": 1.3299, "grad_norm": 0.7050199508666992, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 5000}, {"loss": 1.3798, "grad_norm": 0.6992659568786621, "learning_rate": 0.0002, "epoch": 4.1959798994974875, "step": 5010}, {"loss": 1.3391, "grad_norm": 0.7648445963859558, "learning_rate": 0.0002, "epoch": 4.204355108877722, "step": 5020}, {"loss": 1.3339, "grad_norm": 0.8093137741088867, "learning_rate": 0.0002, "epoch": 4.2127303182579565, "step": 5030}, {"loss": 1.37, "grad_norm": 0.6907750368118286, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 5040}, {"loss": 1.4231, "grad_norm": 0.7000078558921814, "learning_rate": 0.0002, "epoch": 4.2294807370184255, "step": 5050}, {"loss": 1.3411, "grad_norm": 0.715034008026123, "learning_rate": 0.0002, "epoch": 4.23785594639866, "step": 5060}, {"loss": 1.3795, "grad_norm": 0.828895628452301, "learning_rate": 0.0002, "epoch": 4.2462311557788945, "step": 5070}, {"loss": 1.3397, "grad_norm": 0.7127292156219482, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 5080}, {"loss": 1.4255, "grad_norm": 0.8256623148918152, "learning_rate": 0.0002, "epoch": 4.2629815745393635, "step": 5090}, {"loss": 1.4078, "grad_norm": 0.8062452077865601, "learning_rate": 0.0002, "epoch": 4.271356783919598, "step": 5100}, {"loss": 1.3705, "grad_norm": 0.6861081123352051, "learning_rate": 0.0002, "epoch": 4.279731993299833, "step": 5110}, {"loss": 1.3463, "grad_norm": 0.7566041350364685, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 5120}, {"loss": 1.4571, "grad_norm": 0.8734753727912903, "learning_rate": 0.0002, "epoch": 4.296482412060302, "step": 5130}, {"loss": 1.4747, "grad_norm": 0.8559320569038391, "learning_rate": 0.0002, "epoch": 4.304857621440536, "step": 5140}, {"loss": 1.3551, "grad_norm": 0.6965576410293579, "learning_rate": 0.0002, "epoch": 4.313232830820771, "step": 5150}, {"loss": 1.3485, "grad_norm": 0.8277813792228699, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 5160}, {"loss": 1.3433, "grad_norm": 1.0733633041381836, "learning_rate": 0.0002, "epoch": 4.32998324958124, "step": 5170}, {"loss": 1.3953, "grad_norm": 0.7914809584617615, "learning_rate": 0.0002, "epoch": 4.338358458961474, "step": 5180}, {"loss": 1.3907, "grad_norm": 0.8307849168777466, "learning_rate": 0.0002, "epoch": 4.346733668341709, "step": 5190}, {"loss": 1.4318, "grad_norm": 0.7066516280174255, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 5200}, {"loss": 1.3866, "grad_norm": 0.9676792025566101, "learning_rate": 0.0002, "epoch": 4.363484087102178, "step": 5210}, {"loss": 1.3973, "grad_norm": 0.7672301530838013, "learning_rate": 0.0002, "epoch": 4.371859296482412, "step": 5220}, {"loss": 1.3576, "grad_norm": 0.6888260245323181, "learning_rate": 0.0002, "epoch": 4.380234505862647, "step": 5230}, {"loss": 1.3815, "grad_norm": 0.8775295615196228, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 5240}, {"loss": 1.3224, "grad_norm": 0.8742642998695374, "learning_rate": 0.0002, "epoch": 4.396984924623116, "step": 5250}, {"loss": 1.4609, "grad_norm": 0.6935433745384216, "learning_rate": 0.0002, "epoch": 4.40536013400335, "step": 5260}, {"loss": 1.3605, "grad_norm": 0.7726178169250488, "learning_rate": 0.0002, "epoch": 4.413735343383585, "step": 5270}, {"loss": 1.4591, "grad_norm": 0.7493860721588135, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 5280}, {"loss": 1.3277, "grad_norm": 0.7758517265319824, "learning_rate": 0.0002, "epoch": 4.430485762144054, "step": 5290}, {"loss": 1.2916, "grad_norm": 0.779315173625946, "learning_rate": 0.0002, "epoch": 4.438860971524288, "step": 5300}, {"loss": 1.4483, "grad_norm": 0.7753667235374451, "learning_rate": 0.0002, "epoch": 4.447236180904523, "step": 5310}, {"loss": 1.2513, "grad_norm": 0.8738188743591309, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 5320}, {"loss": 1.41, "grad_norm": 0.8410757184028625, "learning_rate": 0.0002, "epoch": 4.463986599664992, "step": 5330}, {"loss": 1.3809, "grad_norm": 0.728897750377655, "learning_rate": 0.0002, "epoch": 4.472361809045226, "step": 5340}, {"loss": 1.4049, "grad_norm": 0.7880531549453735, "learning_rate": 0.0002, "epoch": 4.480737018425461, "step": 5350}, {"loss": 1.4106, "grad_norm": 0.8455142378807068, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 5360}, {"loss": 1.431, "grad_norm": 0.8527868986129761, "learning_rate": 0.0002, "epoch": 4.49748743718593, "step": 5370}, {"loss": 1.3586, "grad_norm": 0.7743009328842163, "learning_rate": 0.0002, "epoch": 4.505862646566165, "step": 5380}, {"loss": 1.4175, "grad_norm": 0.7555320858955383, "learning_rate": 0.0002, "epoch": 4.514237855946399, "step": 5390}, {"loss": 1.3433, "grad_norm": 0.8146619200706482, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 5400}, {"loss": 1.4859, "grad_norm": 0.8042502999305725, "learning_rate": 0.0002, "epoch": 4.530988274706868, "step": 5410}, {"loss": 1.3843, "grad_norm": 0.7329140305519104, "learning_rate": 0.0002, "epoch": 4.539363484087103, "step": 5420}, {"loss": 1.3946, "grad_norm": 0.7574753165245056, "learning_rate": 0.0002, "epoch": 4.547738693467337, "step": 5430}, {"loss": 1.3048, "grad_norm": 1.1223409175872803, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 5440}, {"loss": 1.4067, "grad_norm": 0.7647369503974915, "learning_rate": 0.0002, "epoch": 4.564489112227806, "step": 5450}, {"loss": 1.4569, "grad_norm": 0.9135531187057495, "learning_rate": 0.0002, "epoch": 4.572864321608041, "step": 5460}, {"loss": 1.4813, "grad_norm": 0.9343693852424622, "learning_rate": 0.0002, "epoch": 4.581239530988275, "step": 5470}, {"loss": 1.385, "grad_norm": 0.869945764541626, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 5480}, {"loss": 1.4067, "grad_norm": 0.7383785843849182, "learning_rate": 0.0002, "epoch": 4.597989949748744, "step": 5490}, {"loss": 1.3698, "grad_norm": 0.7988699674606323, "learning_rate": 0.0002, "epoch": 4.606365159128979, "step": 5500}, {"loss": 1.3834, "grad_norm": 0.8731256127357483, "learning_rate": 0.0002, "epoch": 4.614740368509213, "step": 5510}, {"loss": 1.4393, "grad_norm": 0.7577664256095886, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 5520}, {"loss": 1.4418, "grad_norm": 0.7825039625167847, "learning_rate": 0.0002, "epoch": 4.631490787269682, "step": 5530}, {"loss": 1.4594, "grad_norm": 0.8534902930259705, "learning_rate": 0.0002, "epoch": 4.639865996649917, "step": 5540}, {"loss": 1.3689, "grad_norm": 0.7403318285942078, "learning_rate": 0.0002, "epoch": 4.648241206030151, "step": 5550}, {"loss": 1.4456, "grad_norm": 0.8229990005493164, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 5560}, {"loss": 1.3854, "grad_norm": 0.8279513716697693, "learning_rate": 0.0002, "epoch": 4.66499162479062, "step": 5570}, {"loss": 1.4472, "grad_norm": 0.8923851251602173, "learning_rate": 0.0002, "epoch": 4.673366834170855, "step": 5580}, {"loss": 1.3999, "grad_norm": 0.7457540035247803, "learning_rate": 0.0002, "epoch": 4.681742043551089, "step": 5590}, {"loss": 1.4341, "grad_norm": 0.7110715508460999, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 5600}, {"loss": 1.4327, "grad_norm": 0.7135499119758606, "learning_rate": 0.0002, "epoch": 4.698492462311558, "step": 5610}, {"loss": 1.4321, "grad_norm": 0.7606837153434753, "learning_rate": 0.0002, "epoch": 4.706867671691793, "step": 5620}, {"loss": 1.3792, "grad_norm": 0.9622916579246521, "learning_rate": 0.0002, "epoch": 4.715242881072027, "step": 5630}, {"loss": 1.4, "grad_norm": 0.7665684819221497, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 5640}, {"loss": 1.3837, "grad_norm": 0.7985475659370422, "learning_rate": 0.0002, "epoch": 4.731993299832496, "step": 5650}, {"loss": 1.397, "grad_norm": 0.9179279208183289, "learning_rate": 0.0002, "epoch": 4.740368509212731, "step": 5660}, {"loss": 1.4379, "grad_norm": 0.8311634063720703, "learning_rate": 0.0002, "epoch": 4.748743718592965, "step": 5670}, {"loss": 1.3546, "grad_norm": 0.7773269414901733, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 5680}, {"loss": 1.4031, "grad_norm": 0.7771748900413513, "learning_rate": 0.0002, "epoch": 4.765494137353434, "step": 5690}, {"loss": 1.3724, "grad_norm": 0.7518507242202759, "learning_rate": 0.0002, "epoch": 4.773869346733669, "step": 5700}, {"loss": 1.3247, "grad_norm": 0.7699326276779175, "learning_rate": 0.0002, "epoch": 4.782244556113903, "step": 5710}, {"loss": 1.437, "grad_norm": 0.7001115679740906, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 5720}, {"loss": 1.4257, "grad_norm": 0.7220682501792908, "learning_rate": 0.0002, "epoch": 4.798994974874372, "step": 5730}, {"loss": 1.4174, "grad_norm": 0.7654005289077759, "learning_rate": 0.0002, "epoch": 4.807370184254607, "step": 5740}, {"loss": 1.3792, "grad_norm": 0.8132795095443726, "learning_rate": 0.0002, "epoch": 4.815745393634841, "step": 5750}, {"loss": 1.4007, "grad_norm": 0.7105404138565063, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 5760}, {"loss": 1.4289, "grad_norm": 0.9346209764480591, "learning_rate": 0.0002, "epoch": 4.83249581239531, "step": 5770}, {"loss": 1.4066, "grad_norm": 1.0075623989105225, "learning_rate": 0.0002, "epoch": 4.840871021775545, "step": 5780}, {"loss": 1.4558, "grad_norm": 0.758376955986023, "learning_rate": 0.0002, "epoch": 4.849246231155779, "step": 5790}, {"loss": 1.4117, "grad_norm": 0.854821503162384, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 5800}, {"loss": 1.4014, "grad_norm": 0.8226943016052246, "learning_rate": 0.0002, "epoch": 4.865996649916248, "step": 5810}, {"loss": 1.3963, "grad_norm": 0.7510473728179932, "learning_rate": 0.0002, "epoch": 4.874371859296483, "step": 5820}, {"loss": 1.4463, "grad_norm": 0.7449678182601929, "learning_rate": 0.0002, "epoch": 4.882747068676717, "step": 5830}, {"loss": 1.3691, "grad_norm": 0.7840824723243713, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 5840}, {"loss": 1.3795, "grad_norm": 0.8811169862747192, "learning_rate": 0.0002, "epoch": 4.899497487437186, "step": 5850}, {"loss": 1.3827, "grad_norm": 0.84914630651474, "learning_rate": 0.0002, "epoch": 4.907872696817421, "step": 5860}, {"loss": 1.4549, "grad_norm": 0.7514461874961853, "learning_rate": 0.0002, "epoch": 4.916247906197655, "step": 5870}, {"loss": 1.3633, "grad_norm": 0.7229002118110657, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 5880}, {"loss": 1.4302, "grad_norm": 0.9418245553970337, "learning_rate": 0.0002, "epoch": 4.932998324958124, "step": 5890}, {"loss": 1.4747, "grad_norm": 0.7626827359199524, "learning_rate": 0.0002, "epoch": 4.941373534338359, "step": 5900}, {"loss": 1.4462, "grad_norm": 0.7711105346679688, "learning_rate": 0.0002, "epoch": 4.949748743718593, "step": 5910}, {"loss": 1.4104, "grad_norm": 0.8689648509025574, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 5920}, {"loss": 1.4273, "grad_norm": 0.7873271107673645, "learning_rate": 0.0002, "epoch": 4.966499162479062, "step": 5930}, {"loss": 1.4361, "grad_norm": 0.7637495994567871, "learning_rate": 0.0002, "epoch": 4.974874371859297, "step": 5940}, {"loss": 1.5037, "grad_norm": 0.9907955527305603, "learning_rate": 0.0002, "epoch": 4.983249581239531, "step": 5950}, {"loss": 1.4476, "grad_norm": 0.7827328443527222, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 5960}, {"loss": 1.4252, "grad_norm": 0.818544328212738, "learning_rate": 0.0002, "epoch": 5.0, "step": 5970}, {"eval_loss": 1.9436752796173096, "eval_runtime": 38.087, "eval_samples_per_second": 13.522, "eval_steps_per_second": 1.707, "epoch": 5.0, "step": 5970}, {"loss": 1.2367, "grad_norm": 1.1248953342437744, "learning_rate": 0.0002, "epoch": 5.008375209380235, "step": 5980}, {"loss": 1.2221, "grad_norm": 0.9285888075828552, "learning_rate": 0.0002, "epoch": 5.016750418760469, "step": 5990}, {"loss": 1.263, "grad_norm": 0.8626338839530945, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 6000}, {"loss": 1.1839, "grad_norm": 0.8253921270370483, "learning_rate": 0.0002, "epoch": 5.033500837520938, "step": 6010}, {"loss": 1.2773, "grad_norm": 1.079628586769104, "learning_rate": 0.0002, "epoch": 5.041876046901173, "step": 6020}, {"loss": 1.2419, "grad_norm": 0.902625322341919, "learning_rate": 0.0002, "epoch": 5.050251256281407, "step": 6030}, {"loss": 1.164, "grad_norm": 0.9593151211738586, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 6040}, {"loss": 1.2442, "grad_norm": 0.9276060461997986, "learning_rate": 0.0002, "epoch": 5.067001675041876, "step": 6050}, {"loss": 1.2496, "grad_norm": 1.0472362041473389, "learning_rate": 0.0002, "epoch": 5.075376884422111, "step": 6060}, {"loss": 1.2241, "grad_norm": 0.9126865863800049, "learning_rate": 0.0002, "epoch": 5.083752093802345, "step": 6070}, {"loss": 1.1997, "grad_norm": 1.0797888040542603, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 6080}, {"loss": 1.2299, "grad_norm": 0.9538877010345459, "learning_rate": 0.0002, "epoch": 5.100502512562814, "step": 6090}, {"loss": 1.2585, "grad_norm": 1.0604161024093628, "learning_rate": 0.0002, "epoch": 5.108877721943049, "step": 6100}, {"loss": 1.2627, "grad_norm": 1.0178192853927612, "learning_rate": 0.0002, "epoch": 5.117252931323283, "step": 6110}, {"loss": 1.2848, "grad_norm": 1.0262689590454102, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 6120}, {"loss": 1.228, "grad_norm": 0.9046729803085327, "learning_rate": 0.0002, "epoch": 5.134003350083752, "step": 6130}, {"loss": 1.2051, "grad_norm": 1.1244608163833618, "learning_rate": 0.0002, "epoch": 5.142378559463987, "step": 6140}, {"loss": 1.2751, "grad_norm": 1.082835078239441, "learning_rate": 0.0002, "epoch": 5.150753768844221, "step": 6150}, {"loss": 1.1625, "grad_norm": 0.9078734517097473, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 6160}, {"loss": 1.2122, "grad_norm": 1.0688848495483398, "learning_rate": 0.0002, "epoch": 5.16750418760469, "step": 6170}, {"loss": 1.2143, "grad_norm": 1.137519359588623, "learning_rate": 0.0002, "epoch": 5.175879396984925, "step": 6180}, {"loss": 1.3125, "grad_norm": 1.0728670358657837, "learning_rate": 0.0002, "epoch": 5.184254606365159, "step": 6190}, {"loss": 1.2352, "grad_norm": 1.2384949922561646, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 6200}, {"loss": 1.2173, "grad_norm": 0.8391274809837341, "learning_rate": 0.0002, "epoch": 5.201005025125628, "step": 6210}, {"loss": 1.2179, "grad_norm": 0.8948764801025391, "learning_rate": 0.0002, "epoch": 5.209380234505863, "step": 6220}, {"loss": 1.2467, "grad_norm": 0.9568309783935547, "learning_rate": 0.0002, "epoch": 5.217755443886097, "step": 6230}, {"loss": 1.2761, "grad_norm": 1.0604485273361206, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 6240}, {"loss": 1.1407, "grad_norm": 1.1278935670852661, "learning_rate": 0.0002, "epoch": 5.234505862646566, "step": 6250}, {"loss": 1.2332, "grad_norm": 0.9903607368469238, "learning_rate": 0.0002, "epoch": 5.242881072026801, "step": 6260}, {"loss": 1.2544, "grad_norm": 0.958718478679657, "learning_rate": 0.0002, "epoch": 5.251256281407035, "step": 6270}, {"loss": 1.2746, "grad_norm": 1.127510905265808, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 6280}, {"loss": 1.2589, "grad_norm": 1.1683127880096436, "learning_rate": 0.0002, "epoch": 5.268006700167504, "step": 6290}, {"loss": 1.2959, "grad_norm": 1.0723326206207275, "learning_rate": 0.0002, "epoch": 5.276381909547739, "step": 6300}, {"loss": 1.2522, "grad_norm": 0.9285374283790588, "learning_rate": 0.0002, "epoch": 5.284757118927973, "step": 6310}, {"loss": 1.2539, "grad_norm": 0.9201741218566895, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 6320}, {"loss": 1.1816, "grad_norm": 0.9606702923774719, "learning_rate": 0.0002, "epoch": 5.301507537688442, "step": 6330}, {"loss": 1.2928, "grad_norm": 1.107960820198059, "learning_rate": 0.0002, "epoch": 5.309882747068677, "step": 6340}, {"loss": 1.209, "grad_norm": 0.9342933297157288, "learning_rate": 0.0002, "epoch": 5.318257956448911, "step": 6350}, {"loss": 1.2023, "grad_norm": 0.9170576930046082, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 6360}, {"loss": 1.2239, "grad_norm": 0.7612091898918152, "learning_rate": 0.0002, "epoch": 5.33500837520938, "step": 6370}, {"loss": 1.2176, "grad_norm": 1.2524093389511108, "learning_rate": 0.0002, "epoch": 5.343383584589615, "step": 6380}, {"loss": 1.219, "grad_norm": 0.8481650352478027, "learning_rate": 0.0002, "epoch": 5.351758793969849, "step": 6390}, {"loss": 1.237, "grad_norm": 1.0562204122543335, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 6400}, {"loss": 1.1844, "grad_norm": 0.96522456407547, "learning_rate": 0.0002, "epoch": 5.368509212730318, "step": 6410}, {"loss": 1.2465, "grad_norm": 0.9680143594741821, "learning_rate": 0.0002, "epoch": 5.376884422110553, "step": 6420}, {"loss": 1.2809, "grad_norm": 0.9743781685829163, "learning_rate": 0.0002, "epoch": 5.385259631490787, "step": 6430}, {"loss": 1.2637, "grad_norm": 0.8907374143600464, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 6440}, {"loss": 1.2174, "grad_norm": 1.3755217790603638, "learning_rate": 0.0002, "epoch": 5.402010050251256, "step": 6450}, {"loss": 1.224, "grad_norm": 1.1926233768463135, "learning_rate": 0.0002, "epoch": 5.410385259631491, "step": 6460}, {"loss": 1.1685, "grad_norm": 0.8343448638916016, "learning_rate": 0.0002, "epoch": 5.418760469011725, "step": 6470}, {"loss": 1.232, "grad_norm": 1.0056027173995972, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 6480}, {"loss": 1.2936, "grad_norm": 0.9482131600379944, "learning_rate": 0.0002, "epoch": 5.435510887772194, "step": 6490}, {"loss": 1.3084, "grad_norm": 0.9766585826873779, "learning_rate": 0.0002, "epoch": 5.443886097152429, "step": 6500}, {"loss": 1.2758, "grad_norm": 0.9226584434509277, "learning_rate": 0.0002, "epoch": 5.452261306532663, "step": 6510}, {"loss": 1.328, "grad_norm": 0.9605025053024292, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 6520}, {"loss": 1.3285, "grad_norm": 1.0022773742675781, "learning_rate": 0.0002, "epoch": 5.469011725293132, "step": 6530}, {"loss": 1.3126, "grad_norm": 1.056764841079712, "learning_rate": 0.0002, "epoch": 5.477386934673367, "step": 6540}, {"loss": 1.3018, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 5.485762144053601, "step": 6550}, {"loss": 1.2633, "grad_norm": 0.8987206816673279, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 6560}, {"loss": 1.2356, "grad_norm": 1.1946845054626465, "learning_rate": 0.0002, "epoch": 5.50251256281407, "step": 6570}, {"loss": 1.2613, "grad_norm": 1.037416696548462, "learning_rate": 0.0002, "epoch": 5.510887772194305, "step": 6580}, {"loss": 1.2873, "grad_norm": 1.085598349571228, "learning_rate": 0.0002, "epoch": 5.519262981574539, "step": 6590}, {"loss": 1.2562, "grad_norm": 0.9253745079040527, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 6600}, {"loss": 1.3037, "grad_norm": 1.0624418258666992, "learning_rate": 0.0002, "epoch": 5.536013400335008, "step": 6610}, {"loss": 1.2523, "grad_norm": 1.002821922302246, "learning_rate": 0.0002, "epoch": 5.544388609715243, "step": 6620}, {"loss": 1.2662, "grad_norm": 0.9343662858009338, "learning_rate": 0.0002, "epoch": 5.552763819095477, "step": 6630}, {"loss": 1.2467, "grad_norm": 0.9129965305328369, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 6640}, {"loss": 1.2931, "grad_norm": 1.220263957977295, "learning_rate": 0.0002, "epoch": 5.569514237855946, "step": 6650}, {"loss": 1.2638, "grad_norm": 0.9705421924591064, "learning_rate": 0.0002, "epoch": 5.577889447236181, "step": 6660}, {"loss": 1.2815, "grad_norm": 0.8417587876319885, "learning_rate": 0.0002, "epoch": 5.586264656616415, "step": 6670}, {"loss": 1.3616, "grad_norm": 0.9351304769515991, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 6680}, {"loss": 1.2795, "grad_norm": 1.012598991394043, "learning_rate": 0.0002, "epoch": 5.603015075376884, "step": 6690}, {"loss": 1.2457, "grad_norm": 1.018328309059143, "learning_rate": 0.0002, "epoch": 5.611390284757119, "step": 6700}, {"loss": 1.3084, "grad_norm": 0.9289278388023376, "learning_rate": 0.0002, "epoch": 5.619765494137353, "step": 6710}, {"loss": 1.2645, "grad_norm": 0.8390841484069824, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 6720}, {"loss": 1.2676, "grad_norm": 0.9989390969276428, "learning_rate": 0.0002, "epoch": 5.636515912897822, "step": 6730}, {"loss": 1.2937, "grad_norm": 1.0675761699676514, "learning_rate": 0.0002, "epoch": 5.644891122278057, "step": 6740}, {"loss": 1.2599, "grad_norm": 1.0649791955947876, "learning_rate": 0.0002, "epoch": 5.653266331658291, "step": 6750}, {"loss": 1.2191, "grad_norm": 0.8542222380638123, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 6760}, {"loss": 1.2336, "grad_norm": 0.9148173928260803, "learning_rate": 0.0002, "epoch": 5.67001675041876, "step": 6770}, {"loss": 1.3286, "grad_norm": 0.978024423122406, "learning_rate": 0.0002, "epoch": 5.678391959798995, "step": 6780}, {"loss": 1.2821, "grad_norm": 1.0385138988494873, "learning_rate": 0.0002, "epoch": 5.686767169179229, "step": 6790}, {"loss": 1.218, "grad_norm": 0.9687889218330383, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 6800}, {"loss": 1.3256, "grad_norm": 0.862335205078125, "learning_rate": 0.0002, "epoch": 5.703517587939698, "step": 6810}, {"loss": 1.2783, "grad_norm": 0.9729578495025635, "learning_rate": 0.0002, "epoch": 5.711892797319933, "step": 6820}, {"loss": 1.3318, "grad_norm": 0.8936806321144104, "learning_rate": 0.0002, "epoch": 5.720268006700167, "step": 6830}, {"loss": 1.27, "grad_norm": 0.9222455620765686, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 6840}, {"loss": 1.2097, "grad_norm": 1.0584437847137451, "learning_rate": 0.0002, "epoch": 5.7370184254606365, "step": 6850}, {"loss": 1.2308, "grad_norm": 0.9114518165588379, "learning_rate": 0.0002, "epoch": 5.745393634840871, "step": 6860}, {"loss": 1.2767, "grad_norm": 0.9590078592300415, "learning_rate": 0.0002, "epoch": 5.7537688442211055, "step": 6870}, {"loss": 1.2639, "grad_norm": 0.9056822061538696, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 6880}, {"loss": 1.3257, "grad_norm": 1.0069063901901245, "learning_rate": 0.0002, "epoch": 5.7705192629815745, "step": 6890}, {"loss": 1.3382, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 5.778894472361809, "step": 6900}, {"loss": 1.2907, "grad_norm": 0.881629228591919, "learning_rate": 0.0002, "epoch": 5.7872696817420435, "step": 6910}, {"loss": 1.3122, "grad_norm": 1.1020095348358154, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 6920}, {"loss": 1.2985, "grad_norm": 0.8774619102478027, "learning_rate": 0.0002, "epoch": 5.8040201005025125, "step": 6930}, {"loss": 1.311, "grad_norm": 0.9321739673614502, "learning_rate": 0.0002, "epoch": 5.812395309882747, "step": 6940}, {"loss": 1.2951, "grad_norm": 0.9082857966423035, "learning_rate": 0.0002, "epoch": 5.8207705192629815, "step": 6950}, {"loss": 1.2582, "grad_norm": 0.9119554758071899, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 6960}, {"loss": 1.2777, "grad_norm": 1.0643284320831299, "learning_rate": 0.0002, "epoch": 5.8375209380234505, "step": 6970}, {"loss": 1.3319, "grad_norm": 0.8526089787483215, "learning_rate": 0.0002, "epoch": 5.8458961474036855, "step": 6980}, {"loss": 1.2539, "grad_norm": 0.930439829826355, "learning_rate": 0.0002, "epoch": 5.8542713567839195, "step": 6990}, {"loss": 1.3059, "grad_norm": 1.0461677312850952, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 7000}, {"loss": 1.2623, "grad_norm": 0.92561936378479, "learning_rate": 0.0002, "epoch": 5.8710217755443885, "step": 7010}, {"loss": 1.2354, "grad_norm": 0.8936395049095154, "learning_rate": 0.0002, "epoch": 5.8793969849246235, "step": 7020}, {"loss": 1.3232, "grad_norm": 0.986539363861084, "learning_rate": 0.0002, "epoch": 5.8877721943048575, "step": 7030}, {"loss": 1.2399, "grad_norm": 0.8776476383209229, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 7040}, {"loss": 1.2374, "grad_norm": 1.0256905555725098, "learning_rate": 0.0002, "epoch": 5.9045226130653266, "step": 7050}, {"loss": 1.3049, "grad_norm": 0.96241295337677, "learning_rate": 0.0002, "epoch": 5.9128978224455615, "step": 7060}, {"loss": 1.2349, "grad_norm": 1.0251280069351196, "learning_rate": 0.0002, "epoch": 5.921273031825796, "step": 7070}, {"loss": 1.2225, "grad_norm": 1.0794076919555664, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 7080}, {"loss": 1.2978, "grad_norm": 0.9852448105812073, "learning_rate": 0.0002, "epoch": 5.938023450586265, "step": 7090}, {"loss": 1.3278, "grad_norm": 1.1678671836853027, "learning_rate": 0.0002, "epoch": 5.9463986599664995, "step": 7100}, {"loss": 1.2908, "grad_norm": 0.9818310141563416, "learning_rate": 0.0002, "epoch": 5.954773869346734, "step": 7110}, {"loss": 1.3406, "grad_norm": 1.0732046365737915, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 7120}, {"loss": 1.2402, "grad_norm": 0.912470281124115, "learning_rate": 0.0002, "epoch": 5.971524288107203, "step": 7130}, {"loss": 1.2979, "grad_norm": 1.0944788455963135, "learning_rate": 0.0002, "epoch": 5.9798994974874375, "step": 7140}, {"loss": 1.3249, "grad_norm": 1.0393965244293213, "learning_rate": 0.0002, "epoch": 5.988274706867672, "step": 7150}, {"loss": 1.2913, "grad_norm": 0.8758739233016968, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 7160}, {"eval_loss": 2.0526134967803955, "eval_runtime": 37.9699, "eval_samples_per_second": 13.563, "eval_steps_per_second": 1.712, "epoch": 6.0, "step": 7164}, {"loss": 1.1352, "grad_norm": 1.138184666633606, "learning_rate": 0.0002, "epoch": 6.005025125628141, "step": 7170}, {"loss": 1.0727, "grad_norm": 0.9295315742492676, "learning_rate": 0.0002, "epoch": 6.013400335008376, "step": 7180}, {"loss": 1.0859, "grad_norm": 1.1252633333206177, "learning_rate": 0.0002, "epoch": 6.02177554438861, "step": 7190}, {"loss": 1.0827, "grad_norm": 1.0611635446548462, "learning_rate": 0.0002, "epoch": 6.030150753768845, "step": 7200}, {"loss": 1.0756, "grad_norm": 1.022278070449829, "learning_rate": 0.0002, "epoch": 6.038525963149079, "step": 7210}, {"loss": 1.0616, "grad_norm": 1.0280728340148926, "learning_rate": 0.0002, "epoch": 6.046901172529314, "step": 7220}, {"loss": 1.0237, "grad_norm": 0.9516313076019287, "learning_rate": 0.0002, "epoch": 6.055276381909548, "step": 7230}, {"loss": 1.0388, "grad_norm": 1.0925321578979492, "learning_rate": 0.0002, "epoch": 6.063651591289783, "step": 7240}, {"loss": 1.113, "grad_norm": 0.9885565042495728, "learning_rate": 0.0002, "epoch": 6.072026800670017, "step": 7250}, {"loss": 1.1167, "grad_norm": 1.0905766487121582, "learning_rate": 0.0002, "epoch": 6.080402010050252, "step": 7260}, {"loss": 1.0775, "grad_norm": 1.075183391571045, "learning_rate": 0.0002, "epoch": 6.088777219430486, "step": 7270}, {"loss": 1.1371, "grad_norm": 1.0897727012634277, "learning_rate": 0.0002, "epoch": 6.097152428810721, "step": 7280}, {"loss": 1.0335, "grad_norm": 1.3677806854248047, "learning_rate": 0.0002, "epoch": 6.105527638190955, "step": 7290}, {"loss": 1.0566, "grad_norm": 1.1880329847335815, "learning_rate": 0.0002, "epoch": 6.11390284757119, "step": 7300}, {"loss": 1.061, "grad_norm": 1.036330223083496, "learning_rate": 0.0002, "epoch": 6.122278056951424, "step": 7310}, {"loss": 1.0621, "grad_norm": 1.2165348529815674, "learning_rate": 0.0002, "epoch": 6.130653266331659, "step": 7320}, {"loss": 1.0796, "grad_norm": 1.027368187904358, "learning_rate": 0.0002, "epoch": 6.139028475711893, "step": 7330}, {"loss": 1.0994, "grad_norm": 1.2497830390930176, "learning_rate": 0.0002, "epoch": 6.147403685092128, "step": 7340}, {"loss": 1.1616, "grad_norm": 1.166595458984375, "learning_rate": 0.0002, "epoch": 6.155778894472362, "step": 7350}, {"loss": 1.1301, "grad_norm": 1.1143730878829956, "learning_rate": 0.0002, "epoch": 6.164154103852597, "step": 7360}, {"loss": 1.0913, "grad_norm": 1.1531223058700562, "learning_rate": 0.0002, "epoch": 6.172529313232831, "step": 7370}, {"loss": 1.0819, "grad_norm": 1.176507830619812, "learning_rate": 0.0002, "epoch": 6.180904522613066, "step": 7380}, {"loss": 1.0375, "grad_norm": 1.3174604177474976, "learning_rate": 0.0002, "epoch": 6.1892797319933, "step": 7390}, {"loss": 1.1586, "grad_norm": 1.0284459590911865, "learning_rate": 0.0002, "epoch": 6.197654941373535, "step": 7400}, {"loss": 1.1044, "grad_norm": 1.0801599025726318, "learning_rate": 0.0002, "epoch": 6.206030150753769, "step": 7410}, {"loss": 1.1441, "grad_norm": 1.200514554977417, "learning_rate": 0.0002, "epoch": 6.214405360134004, "step": 7420}, {"loss": 1.0234, "grad_norm": 1.0148060321807861, "learning_rate": 0.0002, "epoch": 6.222780569514238, "step": 7430}, {"loss": 1.0616, "grad_norm": 1.2368836402893066, "learning_rate": 0.0002, "epoch": 6.231155778894473, "step": 7440}, {"loss": 1.0781, "grad_norm": 1.228834629058838, "learning_rate": 0.0002, "epoch": 6.239530988274707, "step": 7450}, {"loss": 1.1128, "grad_norm": 1.1588891744613647, "learning_rate": 0.0002, "epoch": 6.247906197654942, "step": 7460}, {"loss": 1.0807, "grad_norm": 1.3500380516052246, "learning_rate": 0.0002, "epoch": 6.256281407035176, "step": 7470}, {"loss": 1.1057, "grad_norm": 1.1429533958435059, "learning_rate": 0.0002, "epoch": 6.264656616415411, "step": 7480}, {"loss": 1.1519, "grad_norm": 1.2314441204071045, "learning_rate": 0.0002, "epoch": 6.273031825795645, "step": 7490}, {"loss": 1.0885, "grad_norm": 1.0917996168136597, "learning_rate": 0.0002, "epoch": 6.28140703517588, "step": 7500}, {"loss": 1.0786, "grad_norm": 1.3294450044631958, "learning_rate": 0.0002, "epoch": 6.289782244556114, "step": 7510}, {"loss": 1.1187, "grad_norm": 1.1035195589065552, "learning_rate": 0.0002, "epoch": 6.298157453936349, "step": 7520}, {"loss": 1.1183, "grad_norm": 1.2643269300460815, "learning_rate": 0.0002, "epoch": 6.306532663316583, "step": 7530}, {"loss": 1.0767, "grad_norm": 1.2226417064666748, "learning_rate": 0.0002, "epoch": 6.314907872696818, "step": 7540}, {"loss": 1.1335, "grad_norm": 1.0248615741729736, "learning_rate": 0.0002, "epoch": 6.323283082077052, "step": 7550}, {"loss": 1.0856, "grad_norm": 1.28317129611969, "learning_rate": 0.0002, "epoch": 6.331658291457287, "step": 7560}, {"loss": 1.166, "grad_norm": 1.1461660861968994, "learning_rate": 0.0002, "epoch": 6.340033500837521, "step": 7570}, {"loss": 1.1627, "grad_norm": 1.297136664390564, "learning_rate": 0.0002, "epoch": 6.348408710217756, "step": 7580}, {"loss": 1.1342, "grad_norm": 1.3376781940460205, "learning_rate": 0.0002, "epoch": 6.35678391959799, "step": 7590}, {"loss": 1.072, "grad_norm": 1.2507376670837402, "learning_rate": 0.0002, "epoch": 6.365159128978225, "step": 7600}, {"loss": 1.0731, "grad_norm": 1.3255126476287842, "learning_rate": 0.0002, "epoch": 6.373534338358459, "step": 7610}, {"loss": 1.0818, "grad_norm": 1.1082066297531128, "learning_rate": 0.0002, "epoch": 6.381909547738694, "step": 7620}, {"loss": 1.0894, "grad_norm": 1.4461497068405151, "learning_rate": 0.0002, "epoch": 6.390284757118928, "step": 7630}, {"loss": 1.1443, "grad_norm": 1.2875033617019653, "learning_rate": 0.0002, "epoch": 6.398659966499163, "step": 7640}, {"loss": 1.1027, "grad_norm": 1.1017295122146606, "learning_rate": 0.0002, "epoch": 6.407035175879397, "step": 7650}, {"loss": 1.1046, "grad_norm": 1.1896536350250244, "learning_rate": 0.0002, "epoch": 6.415410385259632, "step": 7660}, {"loss": 1.1207, "grad_norm": 1.0939011573791504, "learning_rate": 0.0002, "epoch": 6.423785594639866, "step": 7670}, {"loss": 1.1338, "grad_norm": 1.2593132257461548, "learning_rate": 0.0002, "epoch": 6.432160804020101, "step": 7680}, {"loss": 1.071, "grad_norm": 1.1151225566864014, "learning_rate": 0.0002, "epoch": 6.440536013400335, "step": 7690}, {"loss": 1.1832, "grad_norm": 1.0686280727386475, "learning_rate": 0.0002, "epoch": 6.44891122278057, "step": 7700}, {"loss": 1.1611, "grad_norm": 1.4008738994598389, "learning_rate": 0.0002, "epoch": 6.457286432160804, "step": 7710}, {"loss": 1.1191, "grad_norm": 1.1698687076568604, "learning_rate": 0.0002, "epoch": 6.465661641541039, "step": 7720}, {"loss": 1.1637, "grad_norm": 1.1306401491165161, "learning_rate": 0.0002, "epoch": 6.474036850921273, "step": 7730}, {"loss": 1.1534, "grad_norm": 1.2970236539840698, "learning_rate": 0.0002, "epoch": 6.482412060301508, "step": 7740}, {"loss": 1.1408, "grad_norm": 1.1515544652938843, "learning_rate": 0.0002, "epoch": 6.490787269681742, "step": 7750}, {"loss": 1.098, "grad_norm": 1.13273024559021, "learning_rate": 0.0002, "epoch": 6.499162479061977, "step": 7760}, {"loss": 1.1356, "grad_norm": 1.1635724306106567, "learning_rate": 0.0002, "epoch": 6.507537688442211, "step": 7770}, {"loss": 1.0849, "grad_norm": 1.1620264053344727, "learning_rate": 0.0002, "epoch": 6.515912897822446, "step": 7780}, {"loss": 1.1786, "grad_norm": 1.159905195236206, "learning_rate": 0.0002, "epoch": 6.52428810720268, "step": 7790}, {"loss": 1.1252, "grad_norm": 1.2243341207504272, "learning_rate": 0.0002, "epoch": 6.532663316582915, "step": 7800}, {"loss": 1.1654, "grad_norm": 1.1034481525421143, "learning_rate": 0.0002, "epoch": 6.541038525963149, "step": 7810}, {"loss": 1.1579, "grad_norm": 1.1131408214569092, "learning_rate": 0.0002, "epoch": 6.549413735343384, "step": 7820}, {"loss": 1.1053, "grad_norm": 1.211260199546814, "learning_rate": 0.0002, "epoch": 6.557788944723618, "step": 7830}, {"loss": 1.1178, "grad_norm": 1.408692717552185, "learning_rate": 0.0002, "epoch": 6.566164154103853, "step": 7840}, {"loss": 1.1586, "grad_norm": 1.151441216468811, "learning_rate": 0.0002, "epoch": 6.574539363484087, "step": 7850}, {"loss": 1.1754, "grad_norm": 1.1160012483596802, "learning_rate": 0.0002, "epoch": 6.582914572864322, "step": 7860}, {"loss": 1.1092, "grad_norm": 1.2496052980422974, "learning_rate": 0.0002, "epoch": 6.591289782244556, "step": 7870}, {"loss": 1.2007, "grad_norm": 1.559907078742981, "learning_rate": 0.0002, "epoch": 6.599664991624791, "step": 7880}, {"loss": 1.1482, "grad_norm": 1.4399309158325195, "learning_rate": 0.0002, "epoch": 6.608040201005025, "step": 7890}, {"loss": 1.1801, "grad_norm": 1.155007243156433, "learning_rate": 0.0002, "epoch": 6.61641541038526, "step": 7900}, {"loss": 1.2029, "grad_norm": 1.4339076280593872, "learning_rate": 0.0002, "epoch": 6.624790619765494, "step": 7910}, {"loss": 1.1594, "grad_norm": 1.2093058824539185, "learning_rate": 0.0002, "epoch": 6.633165829145729, "step": 7920}, {"loss": 1.185, "grad_norm": 1.1619434356689453, "learning_rate": 0.0002, "epoch": 6.641541038525963, "step": 7930}, {"loss": 1.1369, "grad_norm": 1.2879594564437866, "learning_rate": 0.0002, "epoch": 6.649916247906198, "step": 7940}, {"loss": 1.1992, "grad_norm": 1.0598394870758057, "learning_rate": 0.0002, "epoch": 6.658291457286432, "step": 7950}, {"loss": 1.1337, "grad_norm": 1.0937503576278687, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 7960}, {"loss": 1.1137, "grad_norm": 1.2670115232467651, "learning_rate": 0.0002, "epoch": 6.675041876046901, "step": 7970}, {"loss": 1.1711, "grad_norm": 1.2351782321929932, "learning_rate": 0.0002, "epoch": 6.683417085427136, "step": 7980}, {"loss": 1.1774, "grad_norm": 1.344128131866455, "learning_rate": 0.0002, "epoch": 6.69179229480737, "step": 7990}, {"loss": 1.1739, "grad_norm": 1.2894740104675293, "learning_rate": 0.0002, "epoch": 6.700167504187605, "step": 8000}, {"loss": 1.1045, "grad_norm": 1.1804684400558472, "learning_rate": 0.0002, "epoch": 6.708542713567839, "step": 8010}, {"loss": 1.2371, "grad_norm": 1.314237356185913, "learning_rate": 0.0002, "epoch": 6.716917922948074, "step": 8020}, {"loss": 1.1113, "grad_norm": 1.2132530212402344, "learning_rate": 0.0002, "epoch": 6.725293132328308, "step": 8030}, {"loss": 1.1467, "grad_norm": 0.999580979347229, "learning_rate": 0.0002, "epoch": 6.733668341708543, "step": 8040}, {"loss": 1.1418, "grad_norm": 1.206323266029358, "learning_rate": 0.0002, "epoch": 6.742043551088777, "step": 8050}, {"loss": 1.1265, "grad_norm": 1.1092344522476196, "learning_rate": 0.0002, "epoch": 6.750418760469012, "step": 8060}, {"loss": 1.1583, "grad_norm": 1.0168755054473877, "learning_rate": 0.0002, "epoch": 6.758793969849246, "step": 8070}, {"loss": 1.189, "grad_norm": 1.2310614585876465, "learning_rate": 0.0002, "epoch": 6.767169179229481, "step": 8080}, {"loss": 1.1775, "grad_norm": 1.1587172746658325, "learning_rate": 0.0002, "epoch": 6.775544388609715, "step": 8090}, {"loss": 1.1761, "grad_norm": 1.1362504959106445, "learning_rate": 0.0002, "epoch": 6.78391959798995, "step": 8100}, {"loss": 1.1521, "grad_norm": 1.3735119104385376, "learning_rate": 0.0002, "epoch": 6.792294807370184, "step": 8110}, {"loss": 1.1214, "grad_norm": 1.1804813146591187, "learning_rate": 0.0002, "epoch": 6.800670016750419, "step": 8120}, {"loss": 1.1035, "grad_norm": 1.1849592924118042, "learning_rate": 0.0002, "epoch": 6.809045226130653, "step": 8130}, {"loss": 1.1622, "grad_norm": 1.1638602018356323, "learning_rate": 0.0002, "epoch": 6.817420435510888, "step": 8140}, {"loss": 1.1178, "grad_norm": 1.2106250524520874, "learning_rate": 0.0002, "epoch": 6.825795644891122, "step": 8150}, {"loss": 1.2231, "grad_norm": 1.276068091392517, "learning_rate": 0.0002, "epoch": 6.834170854271357, "step": 8160}, {"loss": 1.1309, "grad_norm": 1.4283488988876343, "learning_rate": 0.0002, "epoch": 6.842546063651591, "step": 8170}, {"loss": 1.1494, "grad_norm": 1.4286448955535889, "learning_rate": 0.0002, "epoch": 6.850921273031826, "step": 8180}, {"loss": 1.185, "grad_norm": 1.191275715827942, "learning_rate": 0.0002, "epoch": 6.85929648241206, "step": 8190}, {"loss": 1.1984, "grad_norm": 1.4232908487319946, "learning_rate": 0.0002, "epoch": 6.867671691792295, "step": 8200}, {"loss": 1.182, "grad_norm": 1.2166317701339722, "learning_rate": 0.0002, "epoch": 6.876046901172529, "step": 8210}, {"loss": 1.1311, "grad_norm": 1.0487027168273926, "learning_rate": 0.0002, "epoch": 6.884422110552764, "step": 8220}, {"loss": 1.1973, "grad_norm": 1.247178077697754, "learning_rate": 0.0002, "epoch": 6.892797319932998, "step": 8230}, {"loss": 1.0942, "grad_norm": 1.0728635787963867, "learning_rate": 0.0002, "epoch": 6.901172529313233, "step": 8240}, {"loss": 1.2106, "grad_norm": 1.1909451484680176, "learning_rate": 0.0002, "epoch": 6.909547738693467, "step": 8250}, {"loss": 1.1336, "grad_norm": 1.337556004524231, "learning_rate": 0.0002, "epoch": 6.917922948073702, "step": 8260}, {"loss": 1.2295, "grad_norm": 1.1479394435882568, "learning_rate": 0.0002, "epoch": 6.926298157453936, "step": 8270}, {"loss": 1.1497, "grad_norm": 1.2038872241973877, "learning_rate": 0.0002, "epoch": 6.934673366834171, "step": 8280}, {"loss": 1.1806, "grad_norm": 1.088813066482544, "learning_rate": 0.0002, "epoch": 6.943048576214405, "step": 8290}, {"loss": 1.181, "grad_norm": 1.0153290033340454, "learning_rate": 0.0002, "epoch": 6.95142378559464, "step": 8300}, {"loss": 1.1846, "grad_norm": 1.2159703969955444, "learning_rate": 0.0002, "epoch": 6.959798994974874, "step": 8310}, {"loss": 1.1029, "grad_norm": 1.0844143629074097, "learning_rate": 0.0002, "epoch": 6.968174204355109, "step": 8320}, {"loss": 1.1843, "grad_norm": 1.1617385149002075, "learning_rate": 0.0002, "epoch": 6.976549413735343, "step": 8330}, {"loss": 1.177, "grad_norm": 1.126503586769104, "learning_rate": 0.0002, "epoch": 6.984924623115578, "step": 8340}, {"loss": 1.1753, "grad_norm": 1.1553548574447632, "learning_rate": 0.0002, "epoch": 6.993299832495812, "step": 8350}]} +{"epoch": 8.0, "step": 9552, "epoch_duration": 1294.212232351303, "total_accumulated_duration": 10564.516312360764, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-1/checkpoint-2388", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6252, "grad_norm": 0.6290814280509949, "learning_rate": 0.0002, "epoch": 0.008375209380234505, "step": 10}, {"loss": 2.3237, "grad_norm": 0.5023976564407349, "learning_rate": 0.0002, "epoch": 0.01675041876046901, "step": 20}, {"loss": 2.1575, "grad_norm": 0.5448721647262573, "learning_rate": 0.0002, "epoch": 0.02512562814070352, "step": 30}, {"loss": 1.967, "grad_norm": 0.4906269609928131, "learning_rate": 0.0002, "epoch": 0.03350083752093802, "step": 40}, {"loss": 1.9464, "grad_norm": 0.49321722984313965, "learning_rate": 0.0002, "epoch": 0.04187604690117253, "step": 50}, {"loss": 1.9645, "grad_norm": 0.4470495581626892, "learning_rate": 0.0002, "epoch": 0.05025125628140704, "step": 60}, {"loss": 1.8989, "grad_norm": 0.49971723556518555, "learning_rate": 0.0002, "epoch": 0.05862646566164154, "step": 70}, {"loss": 1.8629, "grad_norm": 0.4249754548072815, "learning_rate": 0.0002, "epoch": 0.06700167504187604, "step": 80}, {"loss": 1.9229, "grad_norm": 0.43136730790138245, "learning_rate": 0.0002, "epoch": 0.07537688442211055, "step": 90}, {"loss": 1.8768, "grad_norm": 0.5939809679985046, "learning_rate": 0.0002, "epoch": 0.08375209380234507, "step": 100}, {"loss": 1.8811, "grad_norm": 0.4249511659145355, "learning_rate": 0.0002, "epoch": 0.09212730318257957, "step": 110}, {"loss": 1.8912, "grad_norm": 0.451865017414093, "learning_rate": 0.0002, "epoch": 0.10050251256281408, "step": 120}, {"loss": 1.8803, "grad_norm": 0.42394405603408813, "learning_rate": 0.0002, "epoch": 0.10887772194304858, "step": 130}, {"loss": 1.8411, "grad_norm": 0.3683006763458252, "learning_rate": 0.0002, "epoch": 0.11725293132328309, "step": 140}, {"loss": 1.8605, "grad_norm": 0.411150723695755, "learning_rate": 0.0002, "epoch": 0.12562814070351758, "step": 150}, {"loss": 1.7842, "grad_norm": 0.4213576018810272, "learning_rate": 0.0002, "epoch": 0.13400335008375208, "step": 160}, {"loss": 1.8892, "grad_norm": 0.4385589361190796, "learning_rate": 0.0002, "epoch": 0.1423785594639866, "step": 170}, {"loss": 1.8369, "grad_norm": 0.4446942210197449, "learning_rate": 0.0002, "epoch": 0.1507537688442211, "step": 180}, {"loss": 1.7757, "grad_norm": 0.4562969207763672, "learning_rate": 0.0002, "epoch": 0.15912897822445563, "step": 190}, {"loss": 1.8848, "grad_norm": 0.49195992946624756, "learning_rate": 0.0002, "epoch": 0.16750418760469013, "step": 200}, {"loss": 1.8127, "grad_norm": 0.3948725461959839, "learning_rate": 0.0002, "epoch": 0.17587939698492464, "step": 210}, {"loss": 1.7949, "grad_norm": 0.37087398767471313, "learning_rate": 0.0002, "epoch": 0.18425460636515914, "step": 220}, {"loss": 1.8392, "grad_norm": 0.3847447633743286, "learning_rate": 0.0002, "epoch": 0.19262981574539365, "step": 230}, {"loss": 1.7498, "grad_norm": 0.3973361849784851, "learning_rate": 0.0002, "epoch": 0.20100502512562815, "step": 240}, {"loss": 1.7662, "grad_norm": 0.3675636947154999, "learning_rate": 0.0002, "epoch": 0.20938023450586266, "step": 250}, {"loss": 1.8318, "grad_norm": 0.38187175989151, "learning_rate": 0.0002, "epoch": 0.21775544388609716, "step": 260}, {"loss": 1.8004, "grad_norm": 0.36000028252601624, "learning_rate": 0.0002, "epoch": 0.22613065326633167, "step": 270}, {"loss": 1.8129, "grad_norm": 0.3819858729839325, "learning_rate": 0.0002, "epoch": 0.23450586264656617, "step": 280}, {"loss": 1.7971, "grad_norm": 0.36370471119880676, "learning_rate": 0.0002, "epoch": 0.24288107202680068, "step": 290}, {"loss": 1.8518, "grad_norm": 0.3492966294288635, "learning_rate": 0.0002, "epoch": 0.25125628140703515, "step": 300}, {"loss": 1.8292, "grad_norm": 0.32806646823883057, "learning_rate": 0.0002, "epoch": 0.25963149078726966, "step": 310}, {"loss": 1.8338, "grad_norm": 0.3824801743030548, "learning_rate": 0.0002, "epoch": 0.26800670016750416, "step": 320}, {"loss": 1.8702, "grad_norm": 0.48781588673591614, "learning_rate": 0.0002, "epoch": 0.27638190954773867, "step": 330}, {"loss": 1.7858, "grad_norm": 0.416357159614563, "learning_rate": 0.0002, "epoch": 0.2847571189279732, "step": 340}, {"loss": 1.8543, "grad_norm": 0.34518781304359436, "learning_rate": 0.0002, "epoch": 0.2931323283082077, "step": 350}, {"loss": 1.7841, "grad_norm": 0.3333123028278351, "learning_rate": 0.0002, "epoch": 0.3015075376884422, "step": 360}, {"loss": 1.7434, "grad_norm": 0.4125552475452423, "learning_rate": 0.0002, "epoch": 0.3098827470686767, "step": 370}, {"loss": 1.8679, "grad_norm": 0.40044137835502625, "learning_rate": 0.0002, "epoch": 0.31825795644891125, "step": 380}, {"loss": 1.7615, "grad_norm": 0.44981154799461365, "learning_rate": 0.0002, "epoch": 0.32663316582914576, "step": 390}, {"loss": 1.7907, "grad_norm": 0.6972532868385315, "learning_rate": 0.0002, "epoch": 0.33500837520938026, "step": 400}, {"loss": 1.8159, "grad_norm": 0.3069273829460144, "learning_rate": 0.0002, "epoch": 0.34338358458961477, "step": 410}, {"loss": 1.8525, "grad_norm": 0.35586047172546387, "learning_rate": 0.0002, "epoch": 0.35175879396984927, "step": 420}, {"loss": 1.7714, "grad_norm": 0.40816494822502136, "learning_rate": 0.0002, "epoch": 0.3601340033500838, "step": 430}, {"loss": 1.8004, "grad_norm": 0.3377438187599182, "learning_rate": 0.0002, "epoch": 0.3685092127303183, "step": 440}, {"loss": 1.8658, "grad_norm": 0.31523144245147705, "learning_rate": 0.0002, "epoch": 0.3768844221105528, "step": 450}, {"loss": 1.771, "grad_norm": 0.3472132682800293, "learning_rate": 0.0002, "epoch": 0.3852596314907873, "step": 460}, {"loss": 1.808, "grad_norm": 0.3513853847980499, "learning_rate": 0.0002, "epoch": 0.3936348408710218, "step": 470}, {"loss": 1.7818, "grad_norm": 0.366720587015152, "learning_rate": 0.0002, "epoch": 0.4020100502512563, "step": 480}, {"loss": 1.7511, "grad_norm": 0.48535996675491333, "learning_rate": 0.0002, "epoch": 0.4103852596314908, "step": 490}, {"loss": 1.8674, "grad_norm": 0.378305584192276, "learning_rate": 0.0002, "epoch": 0.4187604690117253, "step": 500}, {"loss": 1.8145, "grad_norm": 0.31175753474235535, "learning_rate": 0.0002, "epoch": 0.4271356783919598, "step": 510}, {"loss": 1.7745, "grad_norm": 0.3505520820617676, "learning_rate": 0.0002, "epoch": 0.4355108877721943, "step": 520}, {"loss": 1.8194, "grad_norm": 0.3446848690509796, "learning_rate": 0.0002, "epoch": 0.4438860971524288, "step": 530}, {"loss": 1.7787, "grad_norm": 0.3255297541618347, "learning_rate": 0.0002, "epoch": 0.45226130653266333, "step": 540}, {"loss": 1.8456, "grad_norm": 0.3216710686683655, "learning_rate": 0.0002, "epoch": 0.46063651591289784, "step": 550}, {"loss": 1.7919, "grad_norm": 0.3307957649230957, "learning_rate": 0.0002, "epoch": 0.46901172529313234, "step": 560}, {"loss": 1.8659, "grad_norm": 0.3295125663280487, "learning_rate": 0.0002, "epoch": 0.47738693467336685, "step": 570}, {"loss": 1.7518, "grad_norm": 0.349960595369339, "learning_rate": 0.0002, "epoch": 0.48576214405360135, "step": 580}, {"loss": 1.8474, "grad_norm": 0.32447564601898193, "learning_rate": 0.0002, "epoch": 0.49413735343383586, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3343949615955353, "learning_rate": 0.0002, "epoch": 0.5025125628140703, "step": 600}, {"loss": 1.7856, "grad_norm": 0.3556120991706848, "learning_rate": 0.0002, "epoch": 0.5108877721943048, "step": 610}, {"loss": 1.7425, "grad_norm": 0.38598525524139404, "learning_rate": 0.0002, "epoch": 0.5192629815745393, "step": 620}, {"loss": 1.7857, "grad_norm": 0.3493153154850006, "learning_rate": 0.0002, "epoch": 0.5276381909547738, "step": 630}, {"loss": 1.7699, "grad_norm": 0.35715600848197937, "learning_rate": 0.0002, "epoch": 0.5360134003350083, "step": 640}, {"loss": 1.8295, "grad_norm": 0.3686097264289856, "learning_rate": 0.0002, "epoch": 0.5443886097152428, "step": 650}, {"loss": 1.775, "grad_norm": 0.32571321725845337, "learning_rate": 0.0002, "epoch": 0.5527638190954773, "step": 660}, {"loss": 1.7448, "grad_norm": 0.33986029028892517, "learning_rate": 0.0002, "epoch": 0.5611390284757118, "step": 670}, {"loss": 1.7874, "grad_norm": 0.33575883507728577, "learning_rate": 0.0002, "epoch": 0.5695142378559463, "step": 680}, {"loss": 1.8046, "grad_norm": 0.30621081590652466, "learning_rate": 0.0002, "epoch": 0.5778894472361809, "step": 690}, {"loss": 1.797, "grad_norm": 0.30717912316322327, "learning_rate": 0.0002, "epoch": 0.5862646566164154, "step": 700}, {"loss": 1.7696, "grad_norm": 0.33896031975746155, "learning_rate": 0.0002, "epoch": 0.5946398659966499, "step": 710}, {"loss": 1.8045, "grad_norm": 0.35164183378219604, "learning_rate": 0.0002, "epoch": 0.6030150753768844, "step": 720}, {"loss": 1.8606, "grad_norm": 0.47714051604270935, "learning_rate": 0.0002, "epoch": 0.6113902847571189, "step": 730}, {"loss": 1.8014, "grad_norm": 0.34266430139541626, "learning_rate": 0.0002, "epoch": 0.6197654941373534, "step": 740}, {"loss": 1.756, "grad_norm": 0.354221910238266, "learning_rate": 0.0002, "epoch": 0.628140703517588, "step": 750}, {"loss": 1.7244, "grad_norm": 0.3694717586040497, "learning_rate": 0.0002, "epoch": 0.6365159128978225, "step": 760}, {"loss": 1.7441, "grad_norm": 0.35219788551330566, "learning_rate": 0.0002, "epoch": 0.644891122278057, "step": 770}, {"loss": 1.8616, "grad_norm": 0.31869757175445557, "learning_rate": 0.0002, "epoch": 0.6532663316582915, "step": 780}, {"loss": 1.7981, "grad_norm": 0.3729475736618042, "learning_rate": 0.0002, "epoch": 0.661641541038526, "step": 790}, {"loss": 1.8384, "grad_norm": 0.3431633710861206, "learning_rate": 0.0002, "epoch": 0.6700167504187605, "step": 800}, {"loss": 1.7431, "grad_norm": 0.3452960252761841, "learning_rate": 0.0002, "epoch": 0.678391959798995, "step": 810}, {"loss": 1.8003, "grad_norm": 0.31068870425224304, "learning_rate": 0.0002, "epoch": 0.6867671691792295, "step": 820}, {"loss": 1.8275, "grad_norm": 0.3213907778263092, "learning_rate": 0.0002, "epoch": 0.695142378559464, "step": 830}, {"loss": 1.7975, "grad_norm": 0.2922039330005646, "learning_rate": 0.0002, "epoch": 0.7035175879396985, "step": 840}, {"loss": 1.817, "grad_norm": 0.36271268129348755, "learning_rate": 0.0002, "epoch": 0.711892797319933, "step": 850}, {"loss": 1.7644, "grad_norm": 0.3195357918739319, "learning_rate": 0.0002, "epoch": 0.7202680067001676, "step": 860}, {"loss": 1.8334, "grad_norm": 0.31721433997154236, "learning_rate": 0.0002, "epoch": 0.7286432160804021, "step": 870}, {"loss": 1.832, "grad_norm": 0.32121971249580383, "learning_rate": 0.0002, "epoch": 0.7370184254606366, "step": 880}, {"loss": 1.7315, "grad_norm": 0.3149084150791168, "learning_rate": 0.0002, "epoch": 0.7453936348408711, "step": 890}, {"loss": 1.8399, "grad_norm": 0.38880932331085205, "learning_rate": 0.0002, "epoch": 0.7537688442211056, "step": 900}, {"loss": 1.6838, "grad_norm": 0.31491366028785706, "learning_rate": 0.0002, "epoch": 0.7621440536013401, "step": 910}, {"loss": 1.8054, "grad_norm": 0.2900884449481964, "learning_rate": 0.0002, "epoch": 0.7705192629815746, "step": 920}, {"loss": 1.7352, "grad_norm": 0.31911659240722656, "learning_rate": 0.0002, "epoch": 0.7788944723618091, "step": 930}, {"loss": 1.8334, "grad_norm": 0.33131274580955505, "learning_rate": 0.0002, "epoch": 0.7872696817420436, "step": 940}, {"loss": 1.8077, "grad_norm": 0.2980491816997528, "learning_rate": 0.0002, "epoch": 0.7956448911222781, "step": 950}, {"loss": 1.8254, "grad_norm": 0.3282995820045471, "learning_rate": 0.0002, "epoch": 0.8040201005025126, "step": 960}, {"loss": 1.7695, "grad_norm": 0.3234929144382477, "learning_rate": 0.0002, "epoch": 0.8123953098827471, "step": 970}, {"loss": 1.8491, "grad_norm": 0.31825992465019226, "learning_rate": 0.0002, "epoch": 0.8207705192629816, "step": 980}, {"loss": 1.8002, "grad_norm": 0.32733580470085144, "learning_rate": 0.0002, "epoch": 0.8291457286432161, "step": 990}, {"loss": 1.8407, "grad_norm": 0.3082098066806793, "learning_rate": 0.0002, "epoch": 0.8375209380234506, "step": 1000}, {"loss": 1.7784, "grad_norm": 0.32492074370384216, "learning_rate": 0.0002, "epoch": 0.8458961474036851, "step": 1010}, {"loss": 1.839, "grad_norm": 0.3304888904094696, "learning_rate": 0.0002, "epoch": 0.8542713567839196, "step": 1020}, {"loss": 1.808, "grad_norm": 0.3304980397224426, "learning_rate": 0.0002, "epoch": 0.8626465661641541, "step": 1030}, {"loss": 1.8345, "grad_norm": 0.3537079989910126, "learning_rate": 0.0002, "epoch": 0.8710217755443886, "step": 1040}, {"loss": 1.7469, "grad_norm": 0.34958404302597046, "learning_rate": 0.0002, "epoch": 0.8793969849246231, "step": 1050}, {"loss": 1.8036, "grad_norm": 0.34610459208488464, "learning_rate": 0.0002, "epoch": 0.8877721943048577, "step": 1060}, {"loss": 1.7629, "grad_norm": 0.35725486278533936, "learning_rate": 0.0002, "epoch": 0.8961474036850922, "step": 1070}, {"loss": 1.7997, "grad_norm": 0.30205485224723816, "learning_rate": 0.0002, "epoch": 0.9045226130653267, "step": 1080}, {"loss": 1.7749, "grad_norm": 0.3658352196216583, "learning_rate": 0.0002, "epoch": 0.9128978224455612, "step": 1090}, {"loss": 1.7844, "grad_norm": 0.33731144666671753, "learning_rate": 0.0002, "epoch": 0.9212730318257957, "step": 1100}, {"loss": 1.8047, "grad_norm": 0.35221847891807556, "learning_rate": 0.0002, "epoch": 0.9296482412060302, "step": 1110}, {"loss": 1.7892, "grad_norm": 0.3193749487400055, "learning_rate": 0.0002, "epoch": 0.9380234505862647, "step": 1120}, {"loss": 1.7073, "grad_norm": 0.29893460869789124, "learning_rate": 0.0002, "epoch": 0.9463986599664992, "step": 1130}, {"loss": 1.8226, "grad_norm": 0.37168779969215393, "learning_rate": 0.0002, "epoch": 0.9547738693467337, "step": 1140}, {"loss": 1.7994, "grad_norm": 0.3465111255645752, "learning_rate": 0.0002, "epoch": 0.9631490787269682, "step": 1150}, {"loss": 1.8583, "grad_norm": 0.33802181482315063, "learning_rate": 0.0002, "epoch": 0.9715242881072027, "step": 1160}, {"loss": 1.8652, "grad_norm": 0.36273202300071716, "learning_rate": 0.0002, "epoch": 0.9798994974874372, "step": 1170}, {"loss": 1.7968, "grad_norm": 0.33043375611305237, "learning_rate": 0.0002, "epoch": 0.9882747068676717, "step": 1180}, {"loss": 1.729, "grad_norm": 0.3027370870113373, "learning_rate": 0.0002, "epoch": 0.9966499162479062, "step": 1190}, {"eval_loss": 1.8088148832321167, "eval_runtime": 37.9609, "eval_samples_per_second": 13.567, "eval_steps_per_second": 1.712, "epoch": 1.0, "step": 1194}, {"loss": 1.7492, "grad_norm": 0.4256260097026825, "learning_rate": 0.0002, "epoch": 1.0050251256281406, "step": 1200}, {"loss": 1.6994, "grad_norm": 0.35050156712532043, "learning_rate": 0.0002, "epoch": 1.0134003350083751, "step": 1210}, {"loss": 1.7422, "grad_norm": 0.34773948788642883, "learning_rate": 0.0002, "epoch": 1.0217755443886096, "step": 1220}, {"loss": 1.7803, "grad_norm": 0.35487470030784607, "learning_rate": 0.0002, "epoch": 1.0301507537688441, "step": 1230}, {"loss": 1.7095, "grad_norm": 0.37040361762046814, "learning_rate": 0.0002, "epoch": 1.0385259631490786, "step": 1240}, {"loss": 1.7663, "grad_norm": 0.33740508556365967, "learning_rate": 0.0002, "epoch": 1.0469011725293131, "step": 1250}, {"loss": 1.7485, "grad_norm": 0.3962724506855011, "learning_rate": 0.0002, "epoch": 1.0552763819095476, "step": 1260}, {"loss": 1.7334, "grad_norm": 0.3129824101924896, "learning_rate": 0.0002, "epoch": 1.0636515912897822, "step": 1270}, {"loss": 1.8068, "grad_norm": 0.3620055019855499, "learning_rate": 0.0002, "epoch": 1.0720268006700167, "step": 1280}, {"loss": 1.7823, "grad_norm": 0.3480982184410095, "learning_rate": 0.0002, "epoch": 1.0804020100502512, "step": 1290}, {"loss": 1.7081, "grad_norm": 0.344424843788147, "learning_rate": 0.0002, "epoch": 1.0887772194304857, "step": 1300}, {"loss": 1.7366, "grad_norm": 0.3480122685432434, "learning_rate": 0.0002, "epoch": 1.0971524288107202, "step": 1310}, {"loss": 1.7029, "grad_norm": 0.323662132024765, "learning_rate": 0.0002, "epoch": 1.1055276381909547, "step": 1320}, {"loss": 1.7517, "grad_norm": 0.35440102219581604, "learning_rate": 0.0002, "epoch": 1.1139028475711892, "step": 1330}, {"loss": 1.7573, "grad_norm": 0.3342263698577881, "learning_rate": 0.0002, "epoch": 1.1222780569514237, "step": 1340}, {"loss": 1.7134, "grad_norm": 0.35705259442329407, "learning_rate": 0.0002, "epoch": 1.1306532663316582, "step": 1350}, {"loss": 1.64, "grad_norm": 0.38021907210350037, "learning_rate": 0.0002, "epoch": 1.1390284757118927, "step": 1360}, {"loss": 1.66, "grad_norm": 0.34918731451034546, "learning_rate": 0.0002, "epoch": 1.1474036850921272, "step": 1370}, {"loss": 1.7628, "grad_norm": 0.371868371963501, "learning_rate": 0.0002, "epoch": 1.1557788944723617, "step": 1380}, {"loss": 1.725, "grad_norm": 0.38413912057876587, "learning_rate": 0.0002, "epoch": 1.1641541038525962, "step": 1390}, {"loss": 1.6948, "grad_norm": 0.3898005187511444, "learning_rate": 0.0002, "epoch": 1.1725293132328307, "step": 1400}, {"loss": 1.8105, "grad_norm": 0.3726498484611511, "learning_rate": 0.0002, "epoch": 1.1809045226130652, "step": 1410}, {"loss": 1.7379, "grad_norm": 0.3532905876636505, "learning_rate": 0.0002, "epoch": 1.1892797319932997, "step": 1420}, {"loss": 1.6699, "grad_norm": 0.338127464056015, "learning_rate": 0.0002, "epoch": 1.1976549413735342, "step": 1430}, {"loss": 1.871, "grad_norm": 0.3472749888896942, "learning_rate": 0.0002, "epoch": 1.2060301507537687, "step": 1440}, {"loss": 1.7092, "grad_norm": 0.3523476719856262, "learning_rate": 0.0002, "epoch": 1.2144053601340032, "step": 1450}, {"loss": 1.7329, "grad_norm": 0.42986124753952026, "learning_rate": 0.0002, "epoch": 1.2227805695142377, "step": 1460}, {"loss": 1.7459, "grad_norm": 0.38195517659187317, "learning_rate": 0.0002, "epoch": 1.2311557788944723, "step": 1470}, {"loss": 1.7539, "grad_norm": 0.31665122509002686, "learning_rate": 0.0002, "epoch": 1.2395309882747068, "step": 1480}, {"loss": 1.7224, "grad_norm": 0.3539541959762573, "learning_rate": 0.0002, "epoch": 1.2479061976549413, "step": 1490}, {"loss": 1.7655, "grad_norm": 0.40162816643714905, "learning_rate": 0.0002, "epoch": 1.2562814070351758, "step": 1500}, {"loss": 1.702, "grad_norm": 0.34727150201797485, "learning_rate": 0.0002, "epoch": 1.2646566164154103, "step": 1510}, {"loss": 1.7804, "grad_norm": 0.3364993929862976, "learning_rate": 0.0002, "epoch": 1.2730318257956448, "step": 1520}, {"loss": 1.8063, "grad_norm": 0.323483943939209, "learning_rate": 0.0002, "epoch": 1.2814070351758793, "step": 1530}, {"loss": 1.7622, "grad_norm": 0.4114733934402466, "learning_rate": 0.0002, "epoch": 1.2897822445561138, "step": 1540}, {"loss": 1.6525, "grad_norm": 0.37476620078086853, "learning_rate": 0.0002, "epoch": 1.2981574539363483, "step": 1550}, {"loss": 1.7225, "grad_norm": 0.4216269552707672, "learning_rate": 0.0002, "epoch": 1.3065326633165828, "step": 1560}, {"loss": 1.6995, "grad_norm": 0.3204927444458008, "learning_rate": 0.0002, "epoch": 1.3149078726968173, "step": 1570}, {"loss": 1.7132, "grad_norm": 0.36916354298591614, "learning_rate": 0.0002, "epoch": 1.3232830820770518, "step": 1580}, {"loss": 1.7383, "grad_norm": 0.3755691647529602, "learning_rate": 0.0002, "epoch": 1.3316582914572863, "step": 1590}, {"loss": 1.7351, "grad_norm": 0.3688889443874359, "learning_rate": 0.0002, "epoch": 1.3400335008375208, "step": 1600}, {"loss": 1.7664, "grad_norm": 0.34306398034095764, "learning_rate": 0.0002, "epoch": 1.3484087102177553, "step": 1610}, {"loss": 1.6943, "grad_norm": 0.3651525676250458, "learning_rate": 0.0002, "epoch": 1.3567839195979898, "step": 1620}, {"loss": 1.7206, "grad_norm": 0.3461526036262512, "learning_rate": 0.0002, "epoch": 1.3651591289782243, "step": 1630}, {"loss": 1.728, "grad_norm": 0.37959185242652893, "learning_rate": 0.0002, "epoch": 1.3735343383584588, "step": 1640}, {"loss": 1.746, "grad_norm": 0.4005356431007385, "learning_rate": 0.0002, "epoch": 1.3819095477386933, "step": 1650}, {"loss": 1.694, "grad_norm": 0.3537434935569763, "learning_rate": 0.0002, "epoch": 1.3902847571189278, "step": 1660}, {"loss": 1.6679, "grad_norm": 0.38220855593681335, "learning_rate": 0.0002, "epoch": 1.3986599664991624, "step": 1670}, {"loss": 1.7721, "grad_norm": 0.3573434352874756, "learning_rate": 0.0002, "epoch": 1.4070351758793969, "step": 1680}, {"loss": 1.6983, "grad_norm": 0.40028059482574463, "learning_rate": 0.0002, "epoch": 1.4154103852596314, "step": 1690}, {"loss": 1.7049, "grad_norm": 0.3953610360622406, "learning_rate": 0.0002, "epoch": 1.4237855946398659, "step": 1700}, {"loss": 1.7126, "grad_norm": 0.39524543285369873, "learning_rate": 0.0002, "epoch": 1.4321608040201004, "step": 1710}, {"loss": 1.8319, "grad_norm": 0.37721359729766846, "learning_rate": 0.0002, "epoch": 1.4405360134003349, "step": 1720}, {"loss": 1.7387, "grad_norm": 0.4220093786716461, "learning_rate": 0.0002, "epoch": 1.4489112227805694, "step": 1730}, {"loss": 1.7495, "grad_norm": 0.3876369595527649, "learning_rate": 0.0002, "epoch": 1.457286432160804, "step": 1740}, {"loss": 1.6859, "grad_norm": 0.3774619400501251, "learning_rate": 0.0002, "epoch": 1.4656616415410384, "step": 1750}, {"loss": 1.7223, "grad_norm": 0.3608052432537079, "learning_rate": 0.0002, "epoch": 1.474036850921273, "step": 1760}, {"loss": 1.6746, "grad_norm": 0.32083916664123535, "learning_rate": 0.0002, "epoch": 1.4824120603015074, "step": 1770}, {"loss": 1.716, "grad_norm": 0.32290884852409363, "learning_rate": 0.0002, "epoch": 1.490787269681742, "step": 1780}, {"loss": 1.7648, "grad_norm": 0.3537974953651428, "learning_rate": 0.0002, "epoch": 1.4991624790619764, "step": 1790}, {"loss": 1.6784, "grad_norm": 0.36576104164123535, "learning_rate": 0.0002, "epoch": 1.507537688442211, "step": 1800}, {"loss": 1.6818, "grad_norm": 0.3336752653121948, "learning_rate": 0.0002, "epoch": 1.5159128978224454, "step": 1810}, {"loss": 1.7425, "grad_norm": 0.3551652431488037, "learning_rate": 0.0002, "epoch": 1.52428810720268, "step": 1820}, {"loss": 1.6997, "grad_norm": 0.43313586711883545, "learning_rate": 0.0002, "epoch": 1.5326633165829144, "step": 1830}, {"loss": 1.7358, "grad_norm": 0.39160311222076416, "learning_rate": 0.0002, "epoch": 1.541038525963149, "step": 1840}, {"loss": 1.7709, "grad_norm": 0.38758179545402527, "learning_rate": 0.0002, "epoch": 1.5494137353433834, "step": 1850}, {"loss": 1.7768, "grad_norm": 0.3658832013607025, "learning_rate": 0.0002, "epoch": 1.557788944723618, "step": 1860}, {"loss": 1.7486, "grad_norm": 0.375372052192688, "learning_rate": 0.0002, "epoch": 1.5661641541038525, "step": 1870}, {"loss": 1.6555, "grad_norm": 0.3586942255496979, "learning_rate": 0.0002, "epoch": 1.574539363484087, "step": 1880}, {"loss": 1.7314, "grad_norm": 0.3626467287540436, "learning_rate": 0.0002, "epoch": 1.5829145728643215, "step": 1890}, {"loss": 1.7943, "grad_norm": 0.4199363589286804, "learning_rate": 0.0002, "epoch": 1.591289782244556, "step": 1900}, {"loss": 1.6551, "grad_norm": 0.35646331310272217, "learning_rate": 0.0002, "epoch": 1.5996649916247905, "step": 1910}, {"loss": 1.7125, "grad_norm": 0.3465106189250946, "learning_rate": 0.0002, "epoch": 1.608040201005025, "step": 1920}, {"loss": 1.8507, "grad_norm": 0.43392884731292725, "learning_rate": 0.0002, "epoch": 1.6164154103852595, "step": 1930}, {"loss": 1.7009, "grad_norm": 0.39187198877334595, "learning_rate": 0.0002, "epoch": 1.624790619765494, "step": 1940}, {"loss": 1.7202, "grad_norm": 0.3685080409049988, "learning_rate": 0.0002, "epoch": 1.6331658291457285, "step": 1950}, {"loss": 1.6607, "grad_norm": 0.4044491946697235, "learning_rate": 0.0002, "epoch": 1.641541038525963, "step": 1960}, {"loss": 1.7234, "grad_norm": 0.4388049244880676, "learning_rate": 0.0002, "epoch": 1.6499162479061975, "step": 1970}, {"loss": 1.7178, "grad_norm": 0.36165162920951843, "learning_rate": 0.0002, "epoch": 1.658291457286432, "step": 1980}, {"loss": 1.75, "grad_norm": 0.3501148521900177, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1990}, {"loss": 1.7057, "grad_norm": 0.3751881718635559, "learning_rate": 0.0002, "epoch": 1.675041876046901, "step": 2000}, {"loss": 1.7209, "grad_norm": 0.3902788460254669, "learning_rate": 0.0002, "epoch": 1.6834170854271355, "step": 2010}, {"loss": 1.8517, "grad_norm": 0.39642134308815, "learning_rate": 0.0002, "epoch": 1.69179229480737, "step": 2020}, {"loss": 1.6623, "grad_norm": 0.35721203684806824, "learning_rate": 0.0002, "epoch": 1.7001675041876045, "step": 2030}, {"loss": 1.6988, "grad_norm": 0.360419899225235, "learning_rate": 0.0002, "epoch": 1.708542713567839, "step": 2040}, {"loss": 1.691, "grad_norm": 0.3755600154399872, "learning_rate": 0.0002, "epoch": 1.7169179229480735, "step": 2050}, {"loss": 1.6726, "grad_norm": 0.3939184844493866, "learning_rate": 0.0002, "epoch": 1.725293132328308, "step": 2060}, {"loss": 1.7326, "grad_norm": 0.33955490589141846, "learning_rate": 0.0002, "epoch": 1.7336683417085426, "step": 2070}, {"loss": 1.6794, "grad_norm": 0.35501939058303833, "learning_rate": 0.0002, "epoch": 1.742043551088777, "step": 2080}, {"loss": 1.7312, "grad_norm": 0.38298022747039795, "learning_rate": 0.0002, "epoch": 1.7504187604690116, "step": 2090}, {"loss": 1.6602, "grad_norm": 0.3472785949707031, "learning_rate": 0.0002, "epoch": 1.758793969849246, "step": 2100}, {"loss": 1.6671, "grad_norm": 0.3620430827140808, "learning_rate": 0.0002, "epoch": 1.7671691792294806, "step": 2110}, {"loss": 1.671, "grad_norm": 0.3795909881591797, "learning_rate": 0.0002, "epoch": 1.775544388609715, "step": 2120}, {"loss": 1.7193, "grad_norm": 0.3662523925304413, "learning_rate": 0.0002, "epoch": 1.7839195979899496, "step": 2130}, {"loss": 1.7764, "grad_norm": 0.4113886058330536, "learning_rate": 0.0002, "epoch": 1.792294807370184, "step": 2140}, {"loss": 1.6681, "grad_norm": 0.3765672743320465, "learning_rate": 0.0002, "epoch": 1.8006700167504186, "step": 2150}, {"loss": 1.7481, "grad_norm": 0.41623714566230774, "learning_rate": 0.0002, "epoch": 1.809045226130653, "step": 2160}, {"loss": 1.712, "grad_norm": 0.3724099099636078, "learning_rate": 0.0002, "epoch": 1.8174204355108876, "step": 2170}, {"loss": 1.6912, "grad_norm": 0.3990779221057892, "learning_rate": 0.0002, "epoch": 1.8257956448911221, "step": 2180}, {"loss": 1.7361, "grad_norm": 0.3677702844142914, "learning_rate": 0.0002, "epoch": 1.8341708542713566, "step": 2190}, {"loss": 1.6705, "grad_norm": 0.3944959342479706, "learning_rate": 0.0002, "epoch": 1.8425460636515911, "step": 2200}, {"loss": 1.7619, "grad_norm": 0.3413957357406616, "learning_rate": 0.0002, "epoch": 1.8509212730318256, "step": 2210}, {"loss": 1.7069, "grad_norm": 0.40136098861694336, "learning_rate": 0.0002, "epoch": 1.8592964824120601, "step": 2220}, {"loss": 1.6865, "grad_norm": 0.3496319055557251, "learning_rate": 0.0002, "epoch": 1.8676716917922946, "step": 2230}, {"loss": 1.6906, "grad_norm": 0.3759860694408417, "learning_rate": 0.0002, "epoch": 1.8760469011725294, "step": 2240}, {"loss": 1.8394, "grad_norm": 0.43556007742881775, "learning_rate": 0.0002, "epoch": 1.8844221105527639, "step": 2250}, {"loss": 1.66, "grad_norm": 0.3864828944206238, "learning_rate": 0.0002, "epoch": 1.8927973199329984, "step": 2260}, {"loss": 1.6502, "grad_norm": 0.396930456161499, "learning_rate": 0.0002, "epoch": 1.9011725293132329, "step": 2270}, {"loss": 1.838, "grad_norm": 0.37667879462242126, "learning_rate": 0.0002, "epoch": 1.9095477386934674, "step": 2280}, {"loss": 1.7315, "grad_norm": 0.3539164066314697, "learning_rate": 0.0002, "epoch": 1.917922948073702, "step": 2290}, {"loss": 1.7589, "grad_norm": 0.40542101860046387, "learning_rate": 0.0002, "epoch": 1.9262981574539364, "step": 2300}, {"loss": 1.6795, "grad_norm": 0.37341606616973877, "learning_rate": 0.0002, "epoch": 1.934673366834171, "step": 2310}, {"loss": 1.7058, "grad_norm": 0.4011504352092743, "learning_rate": 0.0002, "epoch": 1.9430485762144054, "step": 2320}, {"loss": 1.688, "grad_norm": 0.37934592366218567, "learning_rate": 0.0002, "epoch": 1.95142378559464, "step": 2330}, {"loss": 1.6699, "grad_norm": 0.32745009660720825, "learning_rate": 0.0002, "epoch": 1.9597989949748744, "step": 2340}, {"loss": 1.7673, "grad_norm": 0.38347750902175903, "learning_rate": 0.0002, "epoch": 1.968174204355109, "step": 2350}, {"loss": 1.7116, "grad_norm": 0.3945120871067047, "learning_rate": 0.0002, "epoch": 1.9765494137353434, "step": 2360}, {"loss": 1.7559, "grad_norm": 0.4034058749675751, "learning_rate": 0.0002, "epoch": 1.984924623115578, "step": 2370}, {"loss": 1.7254, "grad_norm": 0.3546718955039978, "learning_rate": 0.0002, "epoch": 1.9932998324958124, "step": 2380}, {"eval_loss": 1.8061236143112183, "eval_runtime": 38.2113, "eval_samples_per_second": 13.478, "eval_steps_per_second": 1.701, "epoch": 2.0, "step": 2388}, {"loss": 1.7203, "grad_norm": 0.35184019804000854, "learning_rate": 0.0002, "epoch": 2.0016750418760467, "step": 2390}, {"loss": 1.6124, "grad_norm": 0.40416669845581055, "learning_rate": 0.0002, "epoch": 2.0100502512562812, "step": 2400}, {"loss": 1.6092, "grad_norm": 0.3824569880962372, "learning_rate": 0.0002, "epoch": 2.0184254606365157, "step": 2410}, {"loss": 1.641, "grad_norm": 0.42036163806915283, "learning_rate": 0.0002, "epoch": 2.0268006700167502, "step": 2420}, {"loss": 1.6176, "grad_norm": 0.40417996048927307, "learning_rate": 0.0002, "epoch": 2.0351758793969847, "step": 2430}, {"loss": 1.643, "grad_norm": 0.45298922061920166, "learning_rate": 0.0002, "epoch": 2.0435510887772192, "step": 2440}, {"loss": 1.653, "grad_norm": 0.48289841413497925, "learning_rate": 0.0002, "epoch": 2.0519262981574538, "step": 2450}, {"loss": 1.5275, "grad_norm": 0.43702399730682373, "learning_rate": 0.0002, "epoch": 2.0603015075376883, "step": 2460}, {"loss": 1.5825, "grad_norm": 0.49487054347991943, "learning_rate": 0.0002, "epoch": 2.0686767169179228, "step": 2470}, {"loss": 1.6552, "grad_norm": 0.40030500292778015, "learning_rate": 0.0002, "epoch": 2.0770519262981573, "step": 2480}, {"loss": 1.614, "grad_norm": 0.4664880037307739, "learning_rate": 0.0002, "epoch": 2.0854271356783918, "step": 2490}, {"loss": 1.6589, "grad_norm": 0.4111400842666626, "learning_rate": 0.0002, "epoch": 2.0938023450586263, "step": 2500}, {"loss": 1.5788, "grad_norm": 0.4155750572681427, "learning_rate": 0.0002, "epoch": 2.102177554438861, "step": 2510}, {"loss": 1.598, "grad_norm": 0.39257505536079407, "learning_rate": 0.0002, "epoch": 2.1105527638190953, "step": 2520}, {"loss": 1.65, "grad_norm": 0.4156777560710907, "learning_rate": 0.0002, "epoch": 2.11892797319933, "step": 2530}, {"loss": 1.6695, "grad_norm": 0.4025181233882904, "learning_rate": 0.0002, "epoch": 2.1273031825795643, "step": 2540}, {"loss": 1.6471, "grad_norm": 0.42347562313079834, "learning_rate": 0.0002, "epoch": 2.135678391959799, "step": 2550}, {"loss": 1.6014, "grad_norm": 0.47068294882774353, "learning_rate": 0.0002, "epoch": 2.1440536013400333, "step": 2560}, {"loss": 1.6468, "grad_norm": 0.44081777334213257, "learning_rate": 0.0002, "epoch": 2.152428810720268, "step": 2570}, {"loss": 1.641, "grad_norm": 0.44823798537254333, "learning_rate": 0.0002, "epoch": 2.1608040201005023, "step": 2580}, {"loss": 1.6287, "grad_norm": 0.40486326813697815, "learning_rate": 0.0002, "epoch": 2.169179229480737, "step": 2590}, {"loss": 1.6198, "grad_norm": 0.454236775636673, "learning_rate": 0.0002, "epoch": 2.1775544388609713, "step": 2600}, {"loss": 1.5885, "grad_norm": 0.42555344104766846, "learning_rate": 0.0002, "epoch": 2.185929648241206, "step": 2610}, {"loss": 1.6348, "grad_norm": 0.5607381463050842, "learning_rate": 0.0002, "epoch": 2.1943048576214403, "step": 2620}, {"loss": 1.6343, "grad_norm": 0.4095611870288849, "learning_rate": 0.0002, "epoch": 2.202680067001675, "step": 2630}, {"loss": 1.5584, "grad_norm": 0.419342577457428, "learning_rate": 0.0002, "epoch": 2.2110552763819094, "step": 2640}, {"loss": 1.5425, "grad_norm": 0.48541849851608276, "learning_rate": 0.0002, "epoch": 2.219430485762144, "step": 2650}, {"loss": 1.6233, "grad_norm": 0.4365246891975403, "learning_rate": 0.0002, "epoch": 2.2278056951423784, "step": 2660}, {"loss": 1.6886, "grad_norm": 0.46417000889778137, "learning_rate": 0.0002, "epoch": 2.236180904522613, "step": 2670}, {"loss": 1.6345, "grad_norm": 0.5034580230712891, "learning_rate": 0.0002, "epoch": 2.2445561139028474, "step": 2680}, {"loss": 1.5992, "grad_norm": 0.44852879643440247, "learning_rate": 0.0002, "epoch": 2.2529313232830823, "step": 2690}, {"loss": 1.6152, "grad_norm": 0.43886998295783997, "learning_rate": 0.0002, "epoch": 2.2613065326633164, "step": 2700}, {"loss": 1.6533, "grad_norm": 0.45762625336647034, "learning_rate": 0.0002, "epoch": 2.2696817420435513, "step": 2710}, {"loss": 1.5889, "grad_norm": 0.39429017901420593, "learning_rate": 0.0002, "epoch": 2.2780569514237854, "step": 2720}, {"loss": 1.6419, "grad_norm": 0.4420442581176758, "learning_rate": 0.0002, "epoch": 2.2864321608040203, "step": 2730}, {"loss": 1.6126, "grad_norm": 0.4327794015407562, "learning_rate": 0.0002, "epoch": 2.2948073701842544, "step": 2740}, {"loss": 1.6405, "grad_norm": 0.4303780198097229, "learning_rate": 0.0002, "epoch": 2.3031825795644894, "step": 2750}, {"loss": 1.6362, "grad_norm": 0.41379377245903015, "learning_rate": 0.0002, "epoch": 2.3115577889447234, "step": 2760}, {"loss": 1.6744, "grad_norm": 0.4821205735206604, "learning_rate": 0.0002, "epoch": 2.3199329983249584, "step": 2770}, {"loss": 1.6694, "grad_norm": 0.46232181787490845, "learning_rate": 0.0002, "epoch": 2.3283082077051924, "step": 2780}, {"loss": 1.6341, "grad_norm": 0.44937554001808167, "learning_rate": 0.0002, "epoch": 2.3366834170854274, "step": 2790}, {"loss": 1.6556, "grad_norm": 0.443250447511673, "learning_rate": 0.0002, "epoch": 2.3450586264656614, "step": 2800}, {"loss": 1.6874, "grad_norm": 0.4687805473804474, "learning_rate": 0.0002, "epoch": 2.3534338358458964, "step": 2810}, {"loss": 1.6445, "grad_norm": 0.435031920671463, "learning_rate": 0.0002, "epoch": 2.3618090452261304, "step": 2820}, {"loss": 1.6335, "grad_norm": 0.4949858784675598, "learning_rate": 0.0002, "epoch": 2.3701842546063654, "step": 2830}, {"loss": 1.6803, "grad_norm": 0.46349018812179565, "learning_rate": 0.0002, "epoch": 2.3785594639865995, "step": 2840}, {"loss": 1.6586, "grad_norm": 0.46377238631248474, "learning_rate": 0.0002, "epoch": 2.3869346733668344, "step": 2850}, {"loss": 1.5384, "grad_norm": 0.6111940741539001, "learning_rate": 0.0002, "epoch": 2.3953098827470685, "step": 2860}, {"loss": 1.6132, "grad_norm": 0.45090532302856445, "learning_rate": 0.0002, "epoch": 2.4036850921273034, "step": 2870}, {"loss": 1.6047, "grad_norm": 0.4762120842933655, "learning_rate": 0.0002, "epoch": 2.4120603015075375, "step": 2880}, {"loss": 1.6997, "grad_norm": 0.4397919774055481, "learning_rate": 0.0002, "epoch": 2.4204355108877724, "step": 2890}, {"loss": 1.6369, "grad_norm": 0.4765152335166931, "learning_rate": 0.0002, "epoch": 2.4288107202680065, "step": 2900}, {"loss": 1.5982, "grad_norm": 0.4347304403781891, "learning_rate": 0.0002, "epoch": 2.4371859296482414, "step": 2910}, {"loss": 1.6409, "grad_norm": 0.3918324410915375, "learning_rate": 0.0002, "epoch": 2.4455611390284755, "step": 2920}, {"loss": 1.5354, "grad_norm": 0.43932855129241943, "learning_rate": 0.0002, "epoch": 2.4539363484087104, "step": 2930}, {"loss": 1.6283, "grad_norm": 0.46946918964385986, "learning_rate": 0.0002, "epoch": 2.4623115577889445, "step": 2940}, {"loss": 1.6622, "grad_norm": 0.45169174671173096, "learning_rate": 0.0002, "epoch": 2.4706867671691795, "step": 2950}, {"loss": 1.6386, "grad_norm": 0.43488186597824097, "learning_rate": 0.0002, "epoch": 2.4790619765494135, "step": 2960}, {"loss": 1.6187, "grad_norm": 0.42297765612602234, "learning_rate": 0.0002, "epoch": 2.4874371859296485, "step": 2970}, {"loss": 1.5708, "grad_norm": 0.4546392560005188, "learning_rate": 0.0002, "epoch": 2.4958123953098825, "step": 2980}, {"loss": 1.5944, "grad_norm": 0.4236692488193512, "learning_rate": 0.0002, "epoch": 2.5041876046901175, "step": 2990}, {"loss": 1.6927, "grad_norm": 0.46421024203300476, "learning_rate": 0.0002, "epoch": 2.5125628140703515, "step": 3000}, {"loss": 1.6686, "grad_norm": 0.5040220618247986, "learning_rate": 0.0002, "epoch": 2.5209380234505865, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4596138894557953, "learning_rate": 0.0002, "epoch": 2.5293132328308205, "step": 3020}, {"loss": 1.5936, "grad_norm": 0.4410228729248047, "learning_rate": 0.0002, "epoch": 2.5376884422110555, "step": 3030}, {"loss": 1.6336, "grad_norm": 0.553693413734436, "learning_rate": 0.0002, "epoch": 2.5460636515912896, "step": 3040}, {"loss": 1.6377, "grad_norm": 0.41298043727874756, "learning_rate": 0.0002, "epoch": 2.5544388609715245, "step": 3050}, {"loss": 1.7196, "grad_norm": 0.4894513487815857, "learning_rate": 0.0002, "epoch": 2.5628140703517586, "step": 3060}, {"loss": 1.6106, "grad_norm": 0.5525603294372559, "learning_rate": 0.0002, "epoch": 2.5711892797319935, "step": 3070}, {"loss": 1.6089, "grad_norm": 0.5043630003929138, "learning_rate": 0.0002, "epoch": 2.5795644891122276, "step": 3080}, {"loss": 1.5641, "grad_norm": 0.4690920412540436, "learning_rate": 0.0002, "epoch": 2.5879396984924625, "step": 3090}, {"loss": 1.6364, "grad_norm": 0.4358677566051483, "learning_rate": 0.0002, "epoch": 2.5963149078726966, "step": 3100}, {"loss": 1.6328, "grad_norm": 0.4621894061565399, "learning_rate": 0.0002, "epoch": 2.6046901172529315, "step": 3110}, {"loss": 1.7426, "grad_norm": 0.4639507532119751, "learning_rate": 0.0002, "epoch": 2.6130653266331656, "step": 3120}, {"loss": 1.6492, "grad_norm": 0.45161309838294983, "learning_rate": 0.0002, "epoch": 2.6214405360134005, "step": 3130}, {"loss": 1.6221, "grad_norm": 0.49179261922836304, "learning_rate": 0.0002, "epoch": 2.6298157453936346, "step": 3140}, {"loss": 1.663, "grad_norm": 0.4739720821380615, "learning_rate": 0.0002, "epoch": 2.6381909547738696, "step": 3150}, {"loss": 1.616, "grad_norm": 0.468252956867218, "learning_rate": 0.0002, "epoch": 2.6465661641541036, "step": 3160}, {"loss": 1.705, "grad_norm": 0.44691553711891174, "learning_rate": 0.0002, "epoch": 2.6549413735343386, "step": 3170}, {"loss": 1.6558, "grad_norm": 0.47537046670913696, "learning_rate": 0.0002, "epoch": 2.6633165829145726, "step": 3180}, {"loss": 1.6755, "grad_norm": 0.4445202052593231, "learning_rate": 0.0002, "epoch": 2.6716917922948076, "step": 3190}, {"loss": 1.6522, "grad_norm": 0.46785518527030945, "learning_rate": 0.0002, "epoch": 2.6800670016750416, "step": 3200}, {"loss": 1.6711, "grad_norm": 0.4807088077068329, "learning_rate": 0.0002, "epoch": 2.6884422110552766, "step": 3210}, {"loss": 1.6385, "grad_norm": 0.4547516703605652, "learning_rate": 0.0002, "epoch": 2.6968174204355106, "step": 3220}, {"loss": 1.6084, "grad_norm": 0.5200821161270142, "learning_rate": 0.0002, "epoch": 2.7051926298157456, "step": 3230}, {"loss": 1.6434, "grad_norm": 0.4915551245212555, "learning_rate": 0.0002, "epoch": 2.7135678391959797, "step": 3240}, {"loss": 1.6146, "grad_norm": 0.4324817955493927, "learning_rate": 0.0002, "epoch": 2.7219430485762146, "step": 3250}, {"loss": 1.6154, "grad_norm": 0.6290464997291565, "learning_rate": 0.0002, "epoch": 2.7303182579564487, "step": 3260}, {"loss": 1.611, "grad_norm": 0.42255541682243347, "learning_rate": 0.0002, "epoch": 2.7386934673366836, "step": 3270}, {"loss": 1.6345, "grad_norm": 0.47089505195617676, "learning_rate": 0.0002, "epoch": 2.7470686767169177, "step": 3280}, {"loss": 1.6357, "grad_norm": 0.4492960572242737, "learning_rate": 0.0002, "epoch": 2.7554438860971526, "step": 3290}, {"loss": 1.652, "grad_norm": 0.4711938202381134, "learning_rate": 0.0002, "epoch": 2.7638190954773867, "step": 3300}, {"loss": 1.6107, "grad_norm": 0.4635316729545593, "learning_rate": 0.0002, "epoch": 2.7721943048576216, "step": 3310}, {"loss": 1.6044, "grad_norm": 0.4207742512226105, "learning_rate": 0.0002, "epoch": 2.7805695142378557, "step": 3320}, {"loss": 1.6163, "grad_norm": 0.5545504093170166, "learning_rate": 0.0002, "epoch": 2.7889447236180906, "step": 3330}, {"loss": 1.6642, "grad_norm": 0.46976953744888306, "learning_rate": 0.0002, "epoch": 2.7973199329983247, "step": 3340}, {"loss": 1.6879, "grad_norm": 0.4805937111377716, "learning_rate": 0.0002, "epoch": 2.8056951423785597, "step": 3350}, {"loss": 1.6185, "grad_norm": 0.4986467659473419, "learning_rate": 0.0002, "epoch": 2.8140703517587937, "step": 3360}, {"loss": 1.6125, "grad_norm": 0.44702932238578796, "learning_rate": 0.0002, "epoch": 2.8224455611390287, "step": 3370}, {"loss": 1.6318, "grad_norm": 0.4698854088783264, "learning_rate": 0.0002, "epoch": 2.8308207705192627, "step": 3380}, {"loss": 1.6468, "grad_norm": 0.5756528377532959, "learning_rate": 0.0002, "epoch": 2.8391959798994977, "step": 3390}, {"loss": 1.6783, "grad_norm": 0.4266531765460968, "learning_rate": 0.0002, "epoch": 2.8475711892797317, "step": 3400}, {"loss": 1.6351, "grad_norm": 0.5342442989349365, "learning_rate": 0.0002, "epoch": 2.8559463986599667, "step": 3410}, {"loss": 1.659, "grad_norm": 0.47210443019866943, "learning_rate": 0.0002, "epoch": 2.8643216080402008, "step": 3420}, {"loss": 1.6157, "grad_norm": 0.4491795599460602, "learning_rate": 0.0002, "epoch": 2.8726968174204357, "step": 3430}, {"loss": 1.6179, "grad_norm": 0.5387647151947021, "learning_rate": 0.0002, "epoch": 2.8810720268006698, "step": 3440}, {"loss": 1.6415, "grad_norm": 0.5059208273887634, "learning_rate": 0.0002, "epoch": 2.8894472361809047, "step": 3450}, {"loss": 1.6577, "grad_norm": 0.472605437040329, "learning_rate": 0.0002, "epoch": 2.8978224455611388, "step": 3460}, {"loss": 1.6831, "grad_norm": 0.499795138835907, "learning_rate": 0.0002, "epoch": 2.9061976549413737, "step": 3470}, {"loss": 1.6198, "grad_norm": 0.4887969493865967, "learning_rate": 0.0002, "epoch": 2.914572864321608, "step": 3480}, {"loss": 1.5951, "grad_norm": 0.4670022130012512, "learning_rate": 0.0002, "epoch": 2.9229480737018427, "step": 3490}, {"loss": 1.6355, "grad_norm": 0.4475444555282593, "learning_rate": 0.0002, "epoch": 2.931323283082077, "step": 3500}, {"loss": 1.6669, "grad_norm": 0.39244669675827026, "learning_rate": 0.0002, "epoch": 2.9396984924623117, "step": 3510}, {"loss": 1.6094, "grad_norm": 0.4905056059360504, "learning_rate": 0.0002, "epoch": 2.948073701842546, "step": 3520}, {"loss": 1.5774, "grad_norm": 0.4395551085472107, "learning_rate": 0.0002, "epoch": 2.9564489112227808, "step": 3530}, {"loss": 1.6047, "grad_norm": 0.4693661034107208, "learning_rate": 0.0002, "epoch": 2.964824120603015, "step": 3540}, {"loss": 1.648, "grad_norm": 0.473781943321228, "learning_rate": 0.0002, "epoch": 2.9731993299832498, "step": 3550}, {"loss": 1.7056, "grad_norm": 0.4374050796031952, "learning_rate": 0.0002, "epoch": 2.981574539363484, "step": 3560}, {"loss": 1.6816, "grad_norm": 0.46144190430641174, "learning_rate": 0.0002, "epoch": 2.9899497487437188, "step": 3570}, {"loss": 1.5454, "grad_norm": 0.43887680768966675, "learning_rate": 0.0002, "epoch": 2.998324958123953, "step": 3580}, {"eval_loss": 1.8283122777938843, "eval_runtime": 38.023, "eval_samples_per_second": 13.544, "eval_steps_per_second": 1.709, "epoch": 3.0, "step": 3582}, {"loss": 1.5874, "grad_norm": 0.6784713268280029, "learning_rate": 0.0002, "epoch": 3.006700167504188, "step": 3590}, {"loss": 1.5813, "grad_norm": 0.5783940553665161, "learning_rate": 0.0002, "epoch": 3.0150753768844223, "step": 3600}, {"loss": 1.4769, "grad_norm": 0.5408937335014343, "learning_rate": 0.0002, "epoch": 3.023450586264657, "step": 3610}, {"loss": 1.526, "grad_norm": 0.5229013562202454, "learning_rate": 0.0002, "epoch": 3.0318257956448913, "step": 3620}, {"loss": 1.4835, "grad_norm": 0.49160143733024597, "learning_rate": 0.0002, "epoch": 3.040201005025126, "step": 3630}, {"loss": 1.5398, "grad_norm": 0.6563201546669006, "learning_rate": 0.0002, "epoch": 3.0485762144053603, "step": 3640}, {"loss": 1.448, "grad_norm": 0.5686020851135254, "learning_rate": 0.0002, "epoch": 3.056951423785595, "step": 3650}, {"loss": 1.4541, "grad_norm": 0.5774043202400208, "learning_rate": 0.0002, "epoch": 3.0653266331658293, "step": 3660}, {"loss": 1.4734, "grad_norm": 0.6106171011924744, "learning_rate": 0.0002, "epoch": 3.073701842546064, "step": 3670}, {"loss": 1.4961, "grad_norm": 0.517433226108551, "learning_rate": 0.0002, "epoch": 3.0820770519262983, "step": 3680}, {"loss": 1.4961, "grad_norm": 0.5681702494621277, "learning_rate": 0.0002, "epoch": 3.090452261306533, "step": 3690}, {"loss": 1.4731, "grad_norm": 0.5769233107566833, "learning_rate": 0.0002, "epoch": 3.0988274706867673, "step": 3700}, {"loss": 1.4836, "grad_norm": 0.5657462477684021, "learning_rate": 0.0002, "epoch": 3.107202680067002, "step": 3710}, {"loss": 1.4526, "grad_norm": 0.6035246253013611, "learning_rate": 0.0002, "epoch": 3.1155778894472363, "step": 3720}, {"loss": 1.5102, "grad_norm": 0.7286643385887146, "learning_rate": 0.0002, "epoch": 3.123953098827471, "step": 3730}, {"loss": 1.4444, "grad_norm": 0.5121201872825623, "learning_rate": 0.0002, "epoch": 3.1323283082077054, "step": 3740}, {"loss": 1.565, "grad_norm": 0.5074213147163391, "learning_rate": 0.0002, "epoch": 3.14070351758794, "step": 3750}, {"loss": 1.4729, "grad_norm": 0.57481849193573, "learning_rate": 0.0002, "epoch": 3.1490787269681744, "step": 3760}, {"loss": 1.4765, "grad_norm": 0.6326663494110107, "learning_rate": 0.0002, "epoch": 3.157453936348409, "step": 3770}, {"loss": 1.4888, "grad_norm": 0.6039315462112427, "learning_rate": 0.0002, "epoch": 3.1658291457286434, "step": 3780}, {"loss": 1.5084, "grad_norm": 0.6936715245246887, "learning_rate": 0.0002, "epoch": 3.174204355108878, "step": 3790}, {"loss": 1.4879, "grad_norm": 0.6516796946525574, "learning_rate": 0.0002, "epoch": 3.1825795644891124, "step": 3800}, {"loss": 1.578, "grad_norm": 0.6140730977058411, "learning_rate": 0.0002, "epoch": 3.190954773869347, "step": 3810}, {"loss": 1.5101, "grad_norm": 0.631328284740448, "learning_rate": 0.0002, "epoch": 3.1993299832495814, "step": 3820}, {"loss": 1.4844, "grad_norm": 0.6265402436256409, "learning_rate": 0.0002, "epoch": 3.207705192629816, "step": 3830}, {"loss": 1.5332, "grad_norm": 0.6649428606033325, "learning_rate": 0.0002, "epoch": 3.2160804020100504, "step": 3840}, {"loss": 1.5231, "grad_norm": 0.5329259634017944, "learning_rate": 0.0002, "epoch": 3.224455611390285, "step": 3850}, {"loss": 1.5714, "grad_norm": 0.6008304953575134, "learning_rate": 0.0002, "epoch": 3.2328308207705194, "step": 3860}, {"loss": 1.5214, "grad_norm": 0.5918582081794739, "learning_rate": 0.0002, "epoch": 3.241206030150754, "step": 3870}, {"loss": 1.571, "grad_norm": 0.643622100353241, "learning_rate": 0.0002, "epoch": 3.2495812395309884, "step": 3880}, {"loss": 1.5274, "grad_norm": 0.5517964363098145, "learning_rate": 0.0002, "epoch": 3.257956448911223, "step": 3890}, {"loss": 1.5458, "grad_norm": 0.6780755519866943, "learning_rate": 0.0002, "epoch": 3.2663316582914574, "step": 3900}, {"loss": 1.5743, "grad_norm": 0.6742202639579773, "learning_rate": 0.0002, "epoch": 3.274706867671692, "step": 3910}, {"loss": 1.5279, "grad_norm": 0.6228749752044678, "learning_rate": 0.0002, "epoch": 3.2830820770519265, "step": 3920}, {"loss": 1.4899, "grad_norm": 0.5836303234100342, "learning_rate": 0.0002, "epoch": 3.291457286432161, "step": 3930}, {"loss": 1.5445, "grad_norm": 0.6337724328041077, "learning_rate": 0.0002, "epoch": 3.2998324958123955, "step": 3940}, {"loss": 1.5618, "grad_norm": 0.6345084309577942, "learning_rate": 0.0002, "epoch": 3.30820770519263, "step": 3950}, {"loss": 1.4224, "grad_norm": 0.6125303506851196, "learning_rate": 0.0002, "epoch": 3.3165829145728645, "step": 3960}, {"loss": 1.5355, "grad_norm": 0.6259911060333252, "learning_rate": 0.0002, "epoch": 3.324958123953099, "step": 3970}, {"loss": 1.5427, "grad_norm": 0.645745575428009, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 3980}, {"loss": 1.5817, "grad_norm": 0.6666176915168762, "learning_rate": 0.0002, "epoch": 3.341708542713568, "step": 3990}, {"loss": 1.4998, "grad_norm": 0.59013831615448, "learning_rate": 0.0002, "epoch": 3.3500837520938025, "step": 4000}, {"loss": 1.4921, "grad_norm": 0.6604634523391724, "learning_rate": 0.0002, "epoch": 3.358458961474037, "step": 4010}, {"loss": 1.5076, "grad_norm": 0.6676120758056641, "learning_rate": 0.0002, "epoch": 3.3668341708542715, "step": 4020}, {"loss": 1.4801, "grad_norm": 0.515724778175354, "learning_rate": 0.0002, "epoch": 3.375209380234506, "step": 4030}, {"loss": 1.4932, "grad_norm": 0.681968092918396, "learning_rate": 0.0002, "epoch": 3.3835845896147405, "step": 4040}, {"loss": 1.5148, "grad_norm": 0.5978158116340637, "learning_rate": 0.0002, "epoch": 3.391959798994975, "step": 4050}, {"loss": 1.5449, "grad_norm": 0.6043432354927063, "learning_rate": 0.0002, "epoch": 3.4003350083752095, "step": 4060}, {"loss": 1.5021, "grad_norm": 0.5899770855903625, "learning_rate": 0.0002, "epoch": 3.408710217755444, "step": 4070}, {"loss": 1.5992, "grad_norm": 0.6014242172241211, "learning_rate": 0.0002, "epoch": 3.4170854271356785, "step": 4080}, {"loss": 1.4692, "grad_norm": 0.5944811105728149, "learning_rate": 0.0002, "epoch": 3.425460636515913, "step": 4090}, {"loss": 1.5877, "grad_norm": 0.6506822109222412, "learning_rate": 0.0002, "epoch": 3.4338358458961475, "step": 4100}, {"loss": 1.5144, "grad_norm": 0.6926528811454773, "learning_rate": 0.0002, "epoch": 3.442211055276382, "step": 4110}, {"loss": 1.5169, "grad_norm": 0.5646378993988037, "learning_rate": 0.0002, "epoch": 3.4505862646566166, "step": 4120}, {"loss": 1.5032, "grad_norm": 0.7233654856681824, "learning_rate": 0.0002, "epoch": 3.458961474036851, "step": 4130}, {"loss": 1.5161, "grad_norm": 0.6231815814971924, "learning_rate": 0.0002, "epoch": 3.4673366834170856, "step": 4140}, {"loss": 1.5349, "grad_norm": 0.6115689873695374, "learning_rate": 0.0002, "epoch": 3.47571189279732, "step": 4150}, {"loss": 1.4621, "grad_norm": 0.5812674760818481, "learning_rate": 0.0002, "epoch": 3.4840871021775546, "step": 4160}, {"loss": 1.5465, "grad_norm": 0.6099632978439331, "learning_rate": 0.0002, "epoch": 3.492462311557789, "step": 4170}, {"loss": 1.4795, "grad_norm": 0.6102647185325623, "learning_rate": 0.0002, "epoch": 3.5008375209380236, "step": 4180}, {"loss": 1.5305, "grad_norm": 0.6034680008888245, "learning_rate": 0.0002, "epoch": 3.509212730318258, "step": 4190}, {"loss": 1.5093, "grad_norm": 0.6281666159629822, "learning_rate": 0.0002, "epoch": 3.5175879396984926, "step": 4200}, {"loss": 1.4903, "grad_norm": 0.6245372295379639, "learning_rate": 0.0002, "epoch": 3.525963149078727, "step": 4210}, {"loss": 1.5098, "grad_norm": 0.5897293090820312, "learning_rate": 0.0002, "epoch": 3.5343383584589616, "step": 4220}, {"loss": 1.5991, "grad_norm": 0.601054847240448, "learning_rate": 0.0002, "epoch": 3.542713567839196, "step": 4230}, {"loss": 1.4974, "grad_norm": 0.7004473805427551, "learning_rate": 0.0002, "epoch": 3.5510887772194306, "step": 4240}, {"loss": 1.5993, "grad_norm": 0.6601553559303284, "learning_rate": 0.0002, "epoch": 3.559463986599665, "step": 4250}, {"loss": 1.4961, "grad_norm": 0.6112467050552368, "learning_rate": 0.0002, "epoch": 3.5678391959798996, "step": 4260}, {"loss": 1.4967, "grad_norm": 0.5902454853057861, "learning_rate": 0.0002, "epoch": 3.576214405360134, "step": 4270}, {"loss": 1.5659, "grad_norm": 0.5792450904846191, "learning_rate": 0.0002, "epoch": 3.5845896147403686, "step": 4280}, {"loss": 1.4664, "grad_norm": 0.5923888087272644, "learning_rate": 0.0002, "epoch": 3.592964824120603, "step": 4290}, {"loss": 1.5155, "grad_norm": 0.5869482159614563, "learning_rate": 0.0002, "epoch": 3.6013400335008376, "step": 4300}, {"loss": 1.5119, "grad_norm": 0.6372929811477661, "learning_rate": 0.0002, "epoch": 3.609715242881072, "step": 4310}, {"loss": 1.4977, "grad_norm": 0.6350686550140381, "learning_rate": 0.0002, "epoch": 3.6180904522613067, "step": 4320}, {"loss": 1.5226, "grad_norm": 0.571819007396698, "learning_rate": 0.0002, "epoch": 3.626465661641541, "step": 4330}, {"loss": 1.5414, "grad_norm": 0.592250645160675, "learning_rate": 0.0002, "epoch": 3.6348408710217757, "step": 4340}, {"loss": 1.4912, "grad_norm": 0.6110650897026062, "learning_rate": 0.0002, "epoch": 3.64321608040201, "step": 4350}, {"loss": 1.6089, "grad_norm": 0.6187081336975098, "learning_rate": 0.0002, "epoch": 3.6515912897822447, "step": 4360}, {"loss": 1.5345, "grad_norm": 0.6197671890258789, "learning_rate": 0.0002, "epoch": 3.659966499162479, "step": 4370}, {"loss": 1.4988, "grad_norm": 0.6050862669944763, "learning_rate": 0.0002, "epoch": 3.6683417085427137, "step": 4380}, {"loss": 1.4872, "grad_norm": 0.621265172958374, "learning_rate": 0.0002, "epoch": 3.676716917922948, "step": 4390}, {"loss": 1.6011, "grad_norm": 0.6552940011024475, "learning_rate": 0.0002, "epoch": 3.6850921273031827, "step": 4400}, {"loss": 1.4344, "grad_norm": 0.5638861060142517, "learning_rate": 0.0002, "epoch": 3.693467336683417, "step": 4410}, {"loss": 1.4985, "grad_norm": 0.6388863325119019, "learning_rate": 0.0002, "epoch": 3.7018425460636517, "step": 4420}, {"loss": 1.3696, "grad_norm": 0.6062559485435486, "learning_rate": 0.0002, "epoch": 3.710217755443886, "step": 4430}, {"loss": 1.5101, "grad_norm": 0.5800350308418274, "learning_rate": 0.0002, "epoch": 3.7185929648241207, "step": 4440}, {"loss": 1.5286, "grad_norm": 0.5954474210739136, "learning_rate": 0.0002, "epoch": 3.726968174204355, "step": 4450}, {"loss": 1.6133, "grad_norm": 0.5880125761032104, "learning_rate": 0.0002, "epoch": 3.7353433835845897, "step": 4460}, {"loss": 1.5055, "grad_norm": 0.5880921483039856, "learning_rate": 0.0002, "epoch": 3.7437185929648242, "step": 4470}, {"loss": 1.5728, "grad_norm": 0.5995073914527893, "learning_rate": 0.0002, "epoch": 3.7520938023450587, "step": 4480}, {"loss": 1.554, "grad_norm": 0.5958493947982788, "learning_rate": 0.0002, "epoch": 3.7604690117252932, "step": 4490}, {"loss": 1.5472, "grad_norm": 0.5694711804389954, "learning_rate": 0.0002, "epoch": 3.7688442211055277, "step": 4500}, {"loss": 1.5105, "grad_norm": 0.6175141930580139, "learning_rate": 0.0002, "epoch": 3.7772194304857623, "step": 4510}, {"loss": 1.5404, "grad_norm": 0.5541581511497498, "learning_rate": 0.0002, "epoch": 3.7855946398659968, "step": 4520}, {"loss": 1.5283, "grad_norm": 0.5986164808273315, "learning_rate": 0.0002, "epoch": 3.7939698492462313, "step": 4530}, {"loss": 1.4961, "grad_norm": 0.640072226524353, "learning_rate": 0.0002, "epoch": 3.8023450586264658, "step": 4540}, {"loss": 1.5297, "grad_norm": 0.5742579698562622, "learning_rate": 0.0002, "epoch": 3.8107202680067003, "step": 4550}, {"loss": 1.5591, "grad_norm": 0.6658656001091003, "learning_rate": 0.0002, "epoch": 3.819095477386935, "step": 4560}, {"loss": 1.4992, "grad_norm": 0.5475369691848755, "learning_rate": 0.0002, "epoch": 3.8274706867671693, "step": 4570}, {"loss": 1.5966, "grad_norm": 0.613172173500061, "learning_rate": 0.0002, "epoch": 3.835845896147404, "step": 4580}, {"loss": 1.5594, "grad_norm": 0.590968132019043, "learning_rate": 0.0002, "epoch": 3.8442211055276383, "step": 4590}, {"loss": 1.5067, "grad_norm": 0.5865461826324463, "learning_rate": 0.0002, "epoch": 3.852596314907873, "step": 4600}, {"loss": 1.5247, "grad_norm": 0.6815178990364075, "learning_rate": 0.0002, "epoch": 3.8609715242881073, "step": 4610}, {"loss": 1.5702, "grad_norm": 0.6551400423049927, "learning_rate": 0.0002, "epoch": 3.869346733668342, "step": 4620}, {"loss": 1.4891, "grad_norm": 0.6398897171020508, "learning_rate": 0.0002, "epoch": 3.8777219430485763, "step": 4630}, {"loss": 1.5353, "grad_norm": 0.6761762499809265, "learning_rate": 0.0002, "epoch": 3.886097152428811, "step": 4640}, {"loss": 1.6071, "grad_norm": 0.6277294754981995, "learning_rate": 0.0002, "epoch": 3.8944723618090453, "step": 4650}, {"loss": 1.5605, "grad_norm": 0.6285301446914673, "learning_rate": 0.0002, "epoch": 3.90284757118928, "step": 4660}, {"loss": 1.5937, "grad_norm": 0.5416069626808167, "learning_rate": 0.0002, "epoch": 3.9112227805695143, "step": 4670}, {"loss": 1.5461, "grad_norm": 0.6314545273780823, "learning_rate": 0.0002, "epoch": 3.919597989949749, "step": 4680}, {"loss": 1.4828, "grad_norm": 0.604479968547821, "learning_rate": 0.0002, "epoch": 3.9279731993299833, "step": 4690}, {"loss": 1.5186, "grad_norm": 0.5321660041809082, "learning_rate": 0.0002, "epoch": 3.936348408710218, "step": 4700}, {"loss": 1.4696, "grad_norm": 0.6632516980171204, "learning_rate": 0.0002, "epoch": 3.9447236180904524, "step": 4710}, {"loss": 1.519, "grad_norm": 0.5925896763801575, "learning_rate": 0.0002, "epoch": 3.953098827470687, "step": 4720}, {"loss": 1.5716, "grad_norm": 0.6580308675765991, "learning_rate": 0.0002, "epoch": 3.9614740368509214, "step": 4730}, {"loss": 1.4462, "grad_norm": 0.5578170418739319, "learning_rate": 0.0002, "epoch": 3.969849246231156, "step": 4740}, {"loss": 1.5394, "grad_norm": 0.6216608285903931, "learning_rate": 0.0002, "epoch": 3.9782244556113904, "step": 4750}, {"loss": 1.5395, "grad_norm": 0.5693069696426392, "learning_rate": 0.0002, "epoch": 3.986599664991625, "step": 4760}, {"loss": 1.5517, "grad_norm": 0.5353434681892395, "learning_rate": 0.0002, "epoch": 3.9949748743718594, "step": 4770}, {"eval_loss": 1.8809821605682373, "eval_runtime": 37.9695, "eval_samples_per_second": 13.564, "eval_steps_per_second": 1.712, "epoch": 4.0, "step": 4776}, {"loss": 1.4608, "grad_norm": 0.6117817759513855, "learning_rate": 0.0002, "epoch": 4.0033500837520934, "step": 4780}, {"loss": 1.2982, "grad_norm": 0.6816073656082153, "learning_rate": 0.0002, "epoch": 4.011725293132328, "step": 4790}, {"loss": 1.3464, "grad_norm": 0.715548038482666, "learning_rate": 0.0002, "epoch": 4.0201005025125625, "step": 4800}, {"loss": 1.3918, "grad_norm": 0.8585814833641052, "learning_rate": 0.0002, "epoch": 4.028475711892797, "step": 4810}, {"loss": 1.4137, "grad_norm": 0.7372158765792847, "learning_rate": 0.0002, "epoch": 4.0368509212730315, "step": 4820}, {"loss": 1.3769, "grad_norm": 0.8915117979049683, "learning_rate": 0.0002, "epoch": 4.045226130653266, "step": 4830}, {"loss": 1.3551, "grad_norm": 0.9323588013648987, "learning_rate": 0.0002, "epoch": 4.0536013400335005, "step": 4840}, {"loss": 1.3687, "grad_norm": 0.9298437237739563, "learning_rate": 0.0002, "epoch": 4.061976549413735, "step": 4850}, {"loss": 1.4173, "grad_norm": 0.8541792035102844, "learning_rate": 0.0002, "epoch": 4.0703517587939695, "step": 4860}, {"loss": 1.3668, "grad_norm": 0.7833571434020996, "learning_rate": 0.0002, "epoch": 4.078726968174204, "step": 4870}, {"loss": 1.3835, "grad_norm": 0.9325295090675354, "learning_rate": 0.0002, "epoch": 4.0871021775544385, "step": 4880}, {"loss": 1.3834, "grad_norm": 0.7066370248794556, "learning_rate": 0.0002, "epoch": 4.0954773869346734, "step": 4890}, {"loss": 1.3661, "grad_norm": 0.712640643119812, "learning_rate": 0.0002, "epoch": 4.1038525963149075, "step": 4900}, {"loss": 1.3637, "grad_norm": 0.6970218420028687, "learning_rate": 0.0002, "epoch": 4.1122278056951425, "step": 4910}, {"loss": 1.3805, "grad_norm": 0.7979312539100647, "learning_rate": 0.0002, "epoch": 4.1206030150753765, "step": 4920}, {"loss": 1.4115, "grad_norm": 0.7801558375358582, "learning_rate": 0.0002, "epoch": 4.1289782244556115, "step": 4930}, {"loss": 1.3288, "grad_norm": 0.7505159974098206, "learning_rate": 0.0002, "epoch": 4.1373534338358455, "step": 4940}, {"loss": 1.3453, "grad_norm": 0.738201916217804, "learning_rate": 0.0002, "epoch": 4.1457286432160805, "step": 4950}, {"loss": 1.3418, "grad_norm": 0.7736659049987793, "learning_rate": 0.0002, "epoch": 4.1541038525963145, "step": 4960}, {"loss": 1.3663, "grad_norm": 0.7850064635276794, "learning_rate": 0.0002, "epoch": 4.1624790619765495, "step": 4970}, {"loss": 1.326, "grad_norm": 0.8316620588302612, "learning_rate": 0.0002, "epoch": 4.1708542713567835, "step": 4980}, {"loss": 1.377, "grad_norm": 0.7217330932617188, "learning_rate": 0.0002, "epoch": 4.1792294807370185, "step": 4990}, {"loss": 1.3299, "grad_norm": 0.7050199508666992, "learning_rate": 0.0002, "epoch": 4.187604690117253, "step": 5000}, {"loss": 1.3798, "grad_norm": 0.6992659568786621, "learning_rate": 0.0002, "epoch": 4.1959798994974875, "step": 5010}, {"loss": 1.3391, "grad_norm": 0.7648445963859558, "learning_rate": 0.0002, "epoch": 4.204355108877722, "step": 5020}, {"loss": 1.3339, "grad_norm": 0.8093137741088867, "learning_rate": 0.0002, "epoch": 4.2127303182579565, "step": 5030}, {"loss": 1.37, "grad_norm": 0.6907750368118286, "learning_rate": 0.0002, "epoch": 4.221105527638191, "step": 5040}, {"loss": 1.4231, "grad_norm": 0.7000078558921814, "learning_rate": 0.0002, "epoch": 4.2294807370184255, "step": 5050}, {"loss": 1.3411, "grad_norm": 0.715034008026123, "learning_rate": 0.0002, "epoch": 4.23785594639866, "step": 5060}, {"loss": 1.3795, "grad_norm": 0.828895628452301, "learning_rate": 0.0002, "epoch": 4.2462311557788945, "step": 5070}, {"loss": 1.3397, "grad_norm": 0.7127292156219482, "learning_rate": 0.0002, "epoch": 4.254606365159129, "step": 5080}, {"loss": 1.4255, "grad_norm": 0.8256623148918152, "learning_rate": 0.0002, "epoch": 4.2629815745393635, "step": 5090}, {"loss": 1.4078, "grad_norm": 0.8062452077865601, "learning_rate": 0.0002, "epoch": 4.271356783919598, "step": 5100}, {"loss": 1.3705, "grad_norm": 0.6861081123352051, "learning_rate": 0.0002, "epoch": 4.279731993299833, "step": 5110}, {"loss": 1.3463, "grad_norm": 0.7566041350364685, "learning_rate": 0.0002, "epoch": 4.288107202680067, "step": 5120}, {"loss": 1.4571, "grad_norm": 0.8734753727912903, "learning_rate": 0.0002, "epoch": 4.296482412060302, "step": 5130}, {"loss": 1.4747, "grad_norm": 0.8559320569038391, "learning_rate": 0.0002, "epoch": 4.304857621440536, "step": 5140}, {"loss": 1.3551, "grad_norm": 0.6965576410293579, "learning_rate": 0.0002, "epoch": 4.313232830820771, "step": 5150}, {"loss": 1.3485, "grad_norm": 0.8277813792228699, "learning_rate": 0.0002, "epoch": 4.321608040201005, "step": 5160}, {"loss": 1.3433, "grad_norm": 1.0733633041381836, "learning_rate": 0.0002, "epoch": 4.32998324958124, "step": 5170}, {"loss": 1.3953, "grad_norm": 0.7914809584617615, "learning_rate": 0.0002, "epoch": 4.338358458961474, "step": 5180}, {"loss": 1.3907, "grad_norm": 0.8307849168777466, "learning_rate": 0.0002, "epoch": 4.346733668341709, "step": 5190}, {"loss": 1.4318, "grad_norm": 0.7066516280174255, "learning_rate": 0.0002, "epoch": 4.355108877721943, "step": 5200}, {"loss": 1.3866, "grad_norm": 0.9676792025566101, "learning_rate": 0.0002, "epoch": 4.363484087102178, "step": 5210}, {"loss": 1.3973, "grad_norm": 0.7672301530838013, "learning_rate": 0.0002, "epoch": 4.371859296482412, "step": 5220}, {"loss": 1.3576, "grad_norm": 0.6888260245323181, "learning_rate": 0.0002, "epoch": 4.380234505862647, "step": 5230}, {"loss": 1.3815, "grad_norm": 0.8775295615196228, "learning_rate": 0.0002, "epoch": 4.388609715242881, "step": 5240}, {"loss": 1.3224, "grad_norm": 0.8742642998695374, "learning_rate": 0.0002, "epoch": 4.396984924623116, "step": 5250}, {"loss": 1.4609, "grad_norm": 0.6935433745384216, "learning_rate": 0.0002, "epoch": 4.40536013400335, "step": 5260}, {"loss": 1.3605, "grad_norm": 0.7726178169250488, "learning_rate": 0.0002, "epoch": 4.413735343383585, "step": 5270}, {"loss": 1.4591, "grad_norm": 0.7493860721588135, "learning_rate": 0.0002, "epoch": 4.422110552763819, "step": 5280}, {"loss": 1.3277, "grad_norm": 0.7758517265319824, "learning_rate": 0.0002, "epoch": 4.430485762144054, "step": 5290}, {"loss": 1.2916, "grad_norm": 0.779315173625946, "learning_rate": 0.0002, "epoch": 4.438860971524288, "step": 5300}, {"loss": 1.4483, "grad_norm": 0.7753667235374451, "learning_rate": 0.0002, "epoch": 4.447236180904523, "step": 5310}, {"loss": 1.2513, "grad_norm": 0.8738188743591309, "learning_rate": 0.0002, "epoch": 4.455611390284757, "step": 5320}, {"loss": 1.41, "grad_norm": 0.8410757184028625, "learning_rate": 0.0002, "epoch": 4.463986599664992, "step": 5330}, {"loss": 1.3809, "grad_norm": 0.728897750377655, "learning_rate": 0.0002, "epoch": 4.472361809045226, "step": 5340}, {"loss": 1.4049, "grad_norm": 0.7880531549453735, "learning_rate": 0.0002, "epoch": 4.480737018425461, "step": 5350}, {"loss": 1.4106, "grad_norm": 0.8455142378807068, "learning_rate": 0.0002, "epoch": 4.489112227805695, "step": 5360}, {"loss": 1.431, "grad_norm": 0.8527868986129761, "learning_rate": 0.0002, "epoch": 4.49748743718593, "step": 5370}, {"loss": 1.3586, "grad_norm": 0.7743009328842163, "learning_rate": 0.0002, "epoch": 4.505862646566165, "step": 5380}, {"loss": 1.4175, "grad_norm": 0.7555320858955383, "learning_rate": 0.0002, "epoch": 4.514237855946399, "step": 5390}, {"loss": 1.3433, "grad_norm": 0.8146619200706482, "learning_rate": 0.0002, "epoch": 4.522613065326633, "step": 5400}, {"loss": 1.4859, "grad_norm": 0.8042502999305725, "learning_rate": 0.0002, "epoch": 4.530988274706868, "step": 5410}, {"loss": 1.3843, "grad_norm": 0.7329140305519104, "learning_rate": 0.0002, "epoch": 4.539363484087103, "step": 5420}, {"loss": 1.3946, "grad_norm": 0.7574753165245056, "learning_rate": 0.0002, "epoch": 4.547738693467337, "step": 5430}, {"loss": 1.3048, "grad_norm": 1.1223409175872803, "learning_rate": 0.0002, "epoch": 4.556113902847571, "step": 5440}, {"loss": 1.4067, "grad_norm": 0.7647369503974915, "learning_rate": 0.0002, "epoch": 4.564489112227806, "step": 5450}, {"loss": 1.4569, "grad_norm": 0.9135531187057495, "learning_rate": 0.0002, "epoch": 4.572864321608041, "step": 5460}, {"loss": 1.4813, "grad_norm": 0.9343693852424622, "learning_rate": 0.0002, "epoch": 4.581239530988275, "step": 5470}, {"loss": 1.385, "grad_norm": 0.869945764541626, "learning_rate": 0.0002, "epoch": 4.589614740368509, "step": 5480}, {"loss": 1.4067, "grad_norm": 0.7383785843849182, "learning_rate": 0.0002, "epoch": 4.597989949748744, "step": 5490}, {"loss": 1.3698, "grad_norm": 0.7988699674606323, "learning_rate": 0.0002, "epoch": 4.606365159128979, "step": 5500}, {"loss": 1.3834, "grad_norm": 0.8731256127357483, "learning_rate": 0.0002, "epoch": 4.614740368509213, "step": 5510}, {"loss": 1.4393, "grad_norm": 0.7577664256095886, "learning_rate": 0.0002, "epoch": 4.623115577889447, "step": 5520}, {"loss": 1.4418, "grad_norm": 0.7825039625167847, "learning_rate": 0.0002, "epoch": 4.631490787269682, "step": 5530}, {"loss": 1.4594, "grad_norm": 0.8534902930259705, "learning_rate": 0.0002, "epoch": 4.639865996649917, "step": 5540}, {"loss": 1.3689, "grad_norm": 0.7403318285942078, "learning_rate": 0.0002, "epoch": 4.648241206030151, "step": 5550}, {"loss": 1.4456, "grad_norm": 0.8229990005493164, "learning_rate": 0.0002, "epoch": 4.656616415410385, "step": 5560}, {"loss": 1.3854, "grad_norm": 0.8279513716697693, "learning_rate": 0.0002, "epoch": 4.66499162479062, "step": 5570}, {"loss": 1.4472, "grad_norm": 0.8923851251602173, "learning_rate": 0.0002, "epoch": 4.673366834170855, "step": 5580}, {"loss": 1.3999, "grad_norm": 0.7457540035247803, "learning_rate": 0.0002, "epoch": 4.681742043551089, "step": 5590}, {"loss": 1.4341, "grad_norm": 0.7110715508460999, "learning_rate": 0.0002, "epoch": 4.690117252931323, "step": 5600}, {"loss": 1.4327, "grad_norm": 0.7135499119758606, "learning_rate": 0.0002, "epoch": 4.698492462311558, "step": 5610}, {"loss": 1.4321, "grad_norm": 0.7606837153434753, "learning_rate": 0.0002, "epoch": 4.706867671691793, "step": 5620}, {"loss": 1.3792, "grad_norm": 0.9622916579246521, "learning_rate": 0.0002, "epoch": 4.715242881072027, "step": 5630}, {"loss": 1.4, "grad_norm": 0.7665684819221497, "learning_rate": 0.0002, "epoch": 4.723618090452261, "step": 5640}, {"loss": 1.3837, "grad_norm": 0.7985475659370422, "learning_rate": 0.0002, "epoch": 4.731993299832496, "step": 5650}, {"loss": 1.397, "grad_norm": 0.9179279208183289, "learning_rate": 0.0002, "epoch": 4.740368509212731, "step": 5660}, {"loss": 1.4379, "grad_norm": 0.8311634063720703, "learning_rate": 0.0002, "epoch": 4.748743718592965, "step": 5670}, {"loss": 1.3546, "grad_norm": 0.7773269414901733, "learning_rate": 0.0002, "epoch": 4.757118927973199, "step": 5680}, {"loss": 1.4031, "grad_norm": 0.7771748900413513, "learning_rate": 0.0002, "epoch": 4.765494137353434, "step": 5690}, {"loss": 1.3724, "grad_norm": 0.7518507242202759, "learning_rate": 0.0002, "epoch": 4.773869346733669, "step": 5700}, {"loss": 1.3247, "grad_norm": 0.7699326276779175, "learning_rate": 0.0002, "epoch": 4.782244556113903, "step": 5710}, {"loss": 1.437, "grad_norm": 0.7001115679740906, "learning_rate": 0.0002, "epoch": 4.790619765494137, "step": 5720}, {"loss": 1.4257, "grad_norm": 0.7220682501792908, "learning_rate": 0.0002, "epoch": 4.798994974874372, "step": 5730}, {"loss": 1.4174, "grad_norm": 0.7654005289077759, "learning_rate": 0.0002, "epoch": 4.807370184254607, "step": 5740}, {"loss": 1.3792, "grad_norm": 0.8132795095443726, "learning_rate": 0.0002, "epoch": 4.815745393634841, "step": 5750}, {"loss": 1.4007, "grad_norm": 0.7105404138565063, "learning_rate": 0.0002, "epoch": 4.824120603015075, "step": 5760}, {"loss": 1.4289, "grad_norm": 0.9346209764480591, "learning_rate": 0.0002, "epoch": 4.83249581239531, "step": 5770}, {"loss": 1.4066, "grad_norm": 1.0075623989105225, "learning_rate": 0.0002, "epoch": 4.840871021775545, "step": 5780}, {"loss": 1.4558, "grad_norm": 0.758376955986023, "learning_rate": 0.0002, "epoch": 4.849246231155779, "step": 5790}, {"loss": 1.4117, "grad_norm": 0.854821503162384, "learning_rate": 0.0002, "epoch": 4.857621440536013, "step": 5800}, {"loss": 1.4014, "grad_norm": 0.8226943016052246, "learning_rate": 0.0002, "epoch": 4.865996649916248, "step": 5810}, {"loss": 1.3963, "grad_norm": 0.7510473728179932, "learning_rate": 0.0002, "epoch": 4.874371859296483, "step": 5820}, {"loss": 1.4463, "grad_norm": 0.7449678182601929, "learning_rate": 0.0002, "epoch": 4.882747068676717, "step": 5830}, {"loss": 1.3691, "grad_norm": 0.7840824723243713, "learning_rate": 0.0002, "epoch": 4.891122278056951, "step": 5840}, {"loss": 1.3795, "grad_norm": 0.8811169862747192, "learning_rate": 0.0002, "epoch": 4.899497487437186, "step": 5850}, {"loss": 1.3827, "grad_norm": 0.84914630651474, "learning_rate": 0.0002, "epoch": 4.907872696817421, "step": 5860}, {"loss": 1.4549, "grad_norm": 0.7514461874961853, "learning_rate": 0.0002, "epoch": 4.916247906197655, "step": 5870}, {"loss": 1.3633, "grad_norm": 0.7229002118110657, "learning_rate": 0.0002, "epoch": 4.924623115577889, "step": 5880}, {"loss": 1.4302, "grad_norm": 0.9418245553970337, "learning_rate": 0.0002, "epoch": 4.932998324958124, "step": 5890}, {"loss": 1.4747, "grad_norm": 0.7626827359199524, "learning_rate": 0.0002, "epoch": 4.941373534338359, "step": 5900}, {"loss": 1.4462, "grad_norm": 0.7711105346679688, "learning_rate": 0.0002, "epoch": 4.949748743718593, "step": 5910}, {"loss": 1.4104, "grad_norm": 0.8689648509025574, "learning_rate": 0.0002, "epoch": 4.958123953098827, "step": 5920}, {"loss": 1.4273, "grad_norm": 0.7873271107673645, "learning_rate": 0.0002, "epoch": 4.966499162479062, "step": 5930}, {"loss": 1.4361, "grad_norm": 0.7637495994567871, "learning_rate": 0.0002, "epoch": 4.974874371859297, "step": 5940}, {"loss": 1.5037, "grad_norm": 0.9907955527305603, "learning_rate": 0.0002, "epoch": 4.983249581239531, "step": 5950}, {"loss": 1.4476, "grad_norm": 0.7827328443527222, "learning_rate": 0.0002, "epoch": 4.991624790619765, "step": 5960}, {"loss": 1.4252, "grad_norm": 0.818544328212738, "learning_rate": 0.0002, "epoch": 5.0, "step": 5970}, {"eval_loss": 1.9436752796173096, "eval_runtime": 38.087, "eval_samples_per_second": 13.522, "eval_steps_per_second": 1.707, "epoch": 5.0, "step": 5970}, {"loss": 1.2367, "grad_norm": 1.1248953342437744, "learning_rate": 0.0002, "epoch": 5.008375209380235, "step": 5980}, {"loss": 1.2221, "grad_norm": 0.9285888075828552, "learning_rate": 0.0002, "epoch": 5.016750418760469, "step": 5990}, {"loss": 1.263, "grad_norm": 0.8626338839530945, "learning_rate": 0.0002, "epoch": 5.025125628140704, "step": 6000}, {"loss": 1.1839, "grad_norm": 0.8253921270370483, "learning_rate": 0.0002, "epoch": 5.033500837520938, "step": 6010}, {"loss": 1.2773, "grad_norm": 1.079628586769104, "learning_rate": 0.0002, "epoch": 5.041876046901173, "step": 6020}, {"loss": 1.2419, "grad_norm": 0.902625322341919, "learning_rate": 0.0002, "epoch": 5.050251256281407, "step": 6030}, {"loss": 1.164, "grad_norm": 0.9593151211738586, "learning_rate": 0.0002, "epoch": 5.058626465661642, "step": 6040}, {"loss": 1.2442, "grad_norm": 0.9276060461997986, "learning_rate": 0.0002, "epoch": 5.067001675041876, "step": 6050}, {"loss": 1.2496, "grad_norm": 1.0472362041473389, "learning_rate": 0.0002, "epoch": 5.075376884422111, "step": 6060}, {"loss": 1.2241, "grad_norm": 0.9126865863800049, "learning_rate": 0.0002, "epoch": 5.083752093802345, "step": 6070}, {"loss": 1.1997, "grad_norm": 1.0797888040542603, "learning_rate": 0.0002, "epoch": 5.09212730318258, "step": 6080}, {"loss": 1.2299, "grad_norm": 0.9538877010345459, "learning_rate": 0.0002, "epoch": 5.100502512562814, "step": 6090}, {"loss": 1.2585, "grad_norm": 1.0604161024093628, "learning_rate": 0.0002, "epoch": 5.108877721943049, "step": 6100}, {"loss": 1.2627, "grad_norm": 1.0178192853927612, "learning_rate": 0.0002, "epoch": 5.117252931323283, "step": 6110}, {"loss": 1.2848, "grad_norm": 1.0262689590454102, "learning_rate": 0.0002, "epoch": 5.125628140703517, "step": 6120}, {"loss": 1.228, "grad_norm": 0.9046729803085327, "learning_rate": 0.0002, "epoch": 5.134003350083752, "step": 6130}, {"loss": 1.2051, "grad_norm": 1.1244608163833618, "learning_rate": 0.0002, "epoch": 5.142378559463987, "step": 6140}, {"loss": 1.2751, "grad_norm": 1.082835078239441, "learning_rate": 0.0002, "epoch": 5.150753768844221, "step": 6150}, {"loss": 1.1625, "grad_norm": 0.9078734517097473, "learning_rate": 0.0002, "epoch": 5.159128978224456, "step": 6160}, {"loss": 1.2122, "grad_norm": 1.0688848495483398, "learning_rate": 0.0002, "epoch": 5.16750418760469, "step": 6170}, {"loss": 1.2143, "grad_norm": 1.137519359588623, "learning_rate": 0.0002, "epoch": 5.175879396984925, "step": 6180}, {"loss": 1.3125, "grad_norm": 1.0728670358657837, "learning_rate": 0.0002, "epoch": 5.184254606365159, "step": 6190}, {"loss": 1.2352, "grad_norm": 1.2384949922561646, "learning_rate": 0.0002, "epoch": 5.192629815745394, "step": 6200}, {"loss": 1.2173, "grad_norm": 0.8391274809837341, "learning_rate": 0.0002, "epoch": 5.201005025125628, "step": 6210}, {"loss": 1.2179, "grad_norm": 0.8948764801025391, "learning_rate": 0.0002, "epoch": 5.209380234505863, "step": 6220}, {"loss": 1.2467, "grad_norm": 0.9568309783935547, "learning_rate": 0.0002, "epoch": 5.217755443886097, "step": 6230}, {"loss": 1.2761, "grad_norm": 1.0604485273361206, "learning_rate": 0.0002, "epoch": 5.226130653266332, "step": 6240}, {"loss": 1.1407, "grad_norm": 1.1278935670852661, "learning_rate": 0.0002, "epoch": 5.234505862646566, "step": 6250}, {"loss": 1.2332, "grad_norm": 0.9903607368469238, "learning_rate": 0.0002, "epoch": 5.242881072026801, "step": 6260}, {"loss": 1.2544, "grad_norm": 0.958718478679657, "learning_rate": 0.0002, "epoch": 5.251256281407035, "step": 6270}, {"loss": 1.2746, "grad_norm": 1.127510905265808, "learning_rate": 0.0002, "epoch": 5.259631490787269, "step": 6280}, {"loss": 1.2589, "grad_norm": 1.1683127880096436, "learning_rate": 0.0002, "epoch": 5.268006700167504, "step": 6290}, {"loss": 1.2959, "grad_norm": 1.0723326206207275, "learning_rate": 0.0002, "epoch": 5.276381909547739, "step": 6300}, {"loss": 1.2522, "grad_norm": 0.9285374283790588, "learning_rate": 0.0002, "epoch": 5.284757118927973, "step": 6310}, {"loss": 1.2539, "grad_norm": 0.9201741218566895, "learning_rate": 0.0002, "epoch": 5.293132328308207, "step": 6320}, {"loss": 1.1816, "grad_norm": 0.9606702923774719, "learning_rate": 0.0002, "epoch": 5.301507537688442, "step": 6330}, {"loss": 1.2928, "grad_norm": 1.107960820198059, "learning_rate": 0.0002, "epoch": 5.309882747068677, "step": 6340}, {"loss": 1.209, "grad_norm": 0.9342933297157288, "learning_rate": 0.0002, "epoch": 5.318257956448911, "step": 6350}, {"loss": 1.2023, "grad_norm": 0.9170576930046082, "learning_rate": 0.0002, "epoch": 5.326633165829146, "step": 6360}, {"loss": 1.2239, "grad_norm": 0.7612091898918152, "learning_rate": 0.0002, "epoch": 5.33500837520938, "step": 6370}, {"loss": 1.2176, "grad_norm": 1.2524093389511108, "learning_rate": 0.0002, "epoch": 5.343383584589615, "step": 6380}, {"loss": 1.219, "grad_norm": 0.8481650352478027, "learning_rate": 0.0002, "epoch": 5.351758793969849, "step": 6390}, {"loss": 1.237, "grad_norm": 1.0562204122543335, "learning_rate": 0.0002, "epoch": 5.360134003350084, "step": 6400}, {"loss": 1.1844, "grad_norm": 0.96522456407547, "learning_rate": 0.0002, "epoch": 5.368509212730318, "step": 6410}, {"loss": 1.2465, "grad_norm": 0.9680143594741821, "learning_rate": 0.0002, "epoch": 5.376884422110553, "step": 6420}, {"loss": 1.2809, "grad_norm": 0.9743781685829163, "learning_rate": 0.0002, "epoch": 5.385259631490787, "step": 6430}, {"loss": 1.2637, "grad_norm": 0.8907374143600464, "learning_rate": 0.0002, "epoch": 5.393634840871022, "step": 6440}, {"loss": 1.2174, "grad_norm": 1.3755217790603638, "learning_rate": 0.0002, "epoch": 5.402010050251256, "step": 6450}, {"loss": 1.224, "grad_norm": 1.1926233768463135, "learning_rate": 0.0002, "epoch": 5.410385259631491, "step": 6460}, {"loss": 1.1685, "grad_norm": 0.8343448638916016, "learning_rate": 0.0002, "epoch": 5.418760469011725, "step": 6470}, {"loss": 1.232, "grad_norm": 1.0056027173995972, "learning_rate": 0.0002, "epoch": 5.42713567839196, "step": 6480}, {"loss": 1.2936, "grad_norm": 0.9482131600379944, "learning_rate": 0.0002, "epoch": 5.435510887772194, "step": 6490}, {"loss": 1.3084, "grad_norm": 0.9766585826873779, "learning_rate": 0.0002, "epoch": 5.443886097152429, "step": 6500}, {"loss": 1.2758, "grad_norm": 0.9226584434509277, "learning_rate": 0.0002, "epoch": 5.452261306532663, "step": 6510}, {"loss": 1.328, "grad_norm": 0.9605025053024292, "learning_rate": 0.0002, "epoch": 5.460636515912898, "step": 6520}, {"loss": 1.3285, "grad_norm": 1.0022773742675781, "learning_rate": 0.0002, "epoch": 5.469011725293132, "step": 6530}, {"loss": 1.3126, "grad_norm": 1.056764841079712, "learning_rate": 0.0002, "epoch": 5.477386934673367, "step": 6540}, {"loss": 1.3018, "grad_norm": 0.9648325443267822, "learning_rate": 0.0002, "epoch": 5.485762144053601, "step": 6550}, {"loss": 1.2633, "grad_norm": 0.8987206816673279, "learning_rate": 0.0002, "epoch": 5.494137353433836, "step": 6560}, {"loss": 1.2356, "grad_norm": 1.1946845054626465, "learning_rate": 0.0002, "epoch": 5.50251256281407, "step": 6570}, {"loss": 1.2613, "grad_norm": 1.037416696548462, "learning_rate": 0.0002, "epoch": 5.510887772194305, "step": 6580}, {"loss": 1.2873, "grad_norm": 1.085598349571228, "learning_rate": 0.0002, "epoch": 5.519262981574539, "step": 6590}, {"loss": 1.2562, "grad_norm": 0.9253745079040527, "learning_rate": 0.0002, "epoch": 5.527638190954773, "step": 6600}, {"loss": 1.3037, "grad_norm": 1.0624418258666992, "learning_rate": 0.0002, "epoch": 5.536013400335008, "step": 6610}, {"loss": 1.2523, "grad_norm": 1.002821922302246, "learning_rate": 0.0002, "epoch": 5.544388609715243, "step": 6620}, {"loss": 1.2662, "grad_norm": 0.9343662858009338, "learning_rate": 0.0002, "epoch": 5.552763819095477, "step": 6630}, {"loss": 1.2467, "grad_norm": 0.9129965305328369, "learning_rate": 0.0002, "epoch": 5.561139028475711, "step": 6640}, {"loss": 1.2931, "grad_norm": 1.220263957977295, "learning_rate": 0.0002, "epoch": 5.569514237855946, "step": 6650}, {"loss": 1.2638, "grad_norm": 0.9705421924591064, "learning_rate": 0.0002, "epoch": 5.577889447236181, "step": 6660}, {"loss": 1.2815, "grad_norm": 0.8417587876319885, "learning_rate": 0.0002, "epoch": 5.586264656616415, "step": 6670}, {"loss": 1.3616, "grad_norm": 0.9351304769515991, "learning_rate": 0.0002, "epoch": 5.594639865996649, "step": 6680}, {"loss": 1.2795, "grad_norm": 1.012598991394043, "learning_rate": 0.0002, "epoch": 5.603015075376884, "step": 6690}, {"loss": 1.2457, "grad_norm": 1.018328309059143, "learning_rate": 0.0002, "epoch": 5.611390284757119, "step": 6700}, {"loss": 1.3084, "grad_norm": 0.9289278388023376, "learning_rate": 0.0002, "epoch": 5.619765494137353, "step": 6710}, {"loss": 1.2645, "grad_norm": 0.8390841484069824, "learning_rate": 0.0002, "epoch": 5.628140703517588, "step": 6720}, {"loss": 1.2676, "grad_norm": 0.9989390969276428, "learning_rate": 0.0002, "epoch": 5.636515912897822, "step": 6730}, {"loss": 1.2937, "grad_norm": 1.0675761699676514, "learning_rate": 0.0002, "epoch": 5.644891122278057, "step": 6740}, {"loss": 1.2599, "grad_norm": 1.0649791955947876, "learning_rate": 0.0002, "epoch": 5.653266331658291, "step": 6750}, {"loss": 1.2191, "grad_norm": 0.8542222380638123, "learning_rate": 0.0002, "epoch": 5.661641541038526, "step": 6760}, {"loss": 1.2336, "grad_norm": 0.9148173928260803, "learning_rate": 0.0002, "epoch": 5.67001675041876, "step": 6770}, {"loss": 1.3286, "grad_norm": 0.978024423122406, "learning_rate": 0.0002, "epoch": 5.678391959798995, "step": 6780}, {"loss": 1.2821, "grad_norm": 1.0385138988494873, "learning_rate": 0.0002, "epoch": 5.686767169179229, "step": 6790}, {"loss": 1.218, "grad_norm": 0.9687889218330383, "learning_rate": 0.0002, "epoch": 5.695142378559464, "step": 6800}, {"loss": 1.3256, "grad_norm": 0.862335205078125, "learning_rate": 0.0002, "epoch": 5.703517587939698, "step": 6810}, {"loss": 1.2783, "grad_norm": 0.9729578495025635, "learning_rate": 0.0002, "epoch": 5.711892797319933, "step": 6820}, {"loss": 1.3318, "grad_norm": 0.8936806321144104, "learning_rate": 0.0002, "epoch": 5.720268006700167, "step": 6830}, {"loss": 1.27, "grad_norm": 0.9222455620765686, "learning_rate": 0.0002, "epoch": 5.728643216080402, "step": 6840}, {"loss": 1.2097, "grad_norm": 1.0584437847137451, "learning_rate": 0.0002, "epoch": 5.7370184254606365, "step": 6850}, {"loss": 1.2308, "grad_norm": 0.9114518165588379, "learning_rate": 0.0002, "epoch": 5.745393634840871, "step": 6860}, {"loss": 1.2767, "grad_norm": 0.9590078592300415, "learning_rate": 0.0002, "epoch": 5.7537688442211055, "step": 6870}, {"loss": 1.2639, "grad_norm": 0.9056822061538696, "learning_rate": 0.0002, "epoch": 5.76214405360134, "step": 6880}, {"loss": 1.3257, "grad_norm": 1.0069063901901245, "learning_rate": 0.0002, "epoch": 5.7705192629815745, "step": 6890}, {"loss": 1.3382, "grad_norm": 0.9810041189193726, "learning_rate": 0.0002, "epoch": 5.778894472361809, "step": 6900}, {"loss": 1.2907, "grad_norm": 0.881629228591919, "learning_rate": 0.0002, "epoch": 5.7872696817420435, "step": 6910}, {"loss": 1.3122, "grad_norm": 1.1020095348358154, "learning_rate": 0.0002, "epoch": 5.795644891122278, "step": 6920}, {"loss": 1.2985, "grad_norm": 0.8774619102478027, "learning_rate": 0.0002, "epoch": 5.8040201005025125, "step": 6930}, {"loss": 1.311, "grad_norm": 0.9321739673614502, "learning_rate": 0.0002, "epoch": 5.812395309882747, "step": 6940}, {"loss": 1.2951, "grad_norm": 0.9082857966423035, "learning_rate": 0.0002, "epoch": 5.8207705192629815, "step": 6950}, {"loss": 1.2582, "grad_norm": 0.9119554758071899, "learning_rate": 0.0002, "epoch": 5.8291457286432165, "step": 6960}, {"loss": 1.2777, "grad_norm": 1.0643284320831299, "learning_rate": 0.0002, "epoch": 5.8375209380234505, "step": 6970}, {"loss": 1.3319, "grad_norm": 0.8526089787483215, "learning_rate": 0.0002, "epoch": 5.8458961474036855, "step": 6980}, {"loss": 1.2539, "grad_norm": 0.930439829826355, "learning_rate": 0.0002, "epoch": 5.8542713567839195, "step": 6990}, {"loss": 1.3059, "grad_norm": 1.0461677312850952, "learning_rate": 0.0002, "epoch": 5.8626465661641545, "step": 7000}, {"loss": 1.2623, "grad_norm": 0.92561936378479, "learning_rate": 0.0002, "epoch": 5.8710217755443885, "step": 7010}, {"loss": 1.2354, "grad_norm": 0.8936395049095154, "learning_rate": 0.0002, "epoch": 5.8793969849246235, "step": 7020}, {"loss": 1.3232, "grad_norm": 0.986539363861084, "learning_rate": 0.0002, "epoch": 5.8877721943048575, "step": 7030}, {"loss": 1.2399, "grad_norm": 0.8776476383209229, "learning_rate": 0.0002, "epoch": 5.8961474036850925, "step": 7040}, {"loss": 1.2374, "grad_norm": 1.0256905555725098, "learning_rate": 0.0002, "epoch": 5.9045226130653266, "step": 7050}, {"loss": 1.3049, "grad_norm": 0.96241295337677, "learning_rate": 0.0002, "epoch": 5.9128978224455615, "step": 7060}, {"loss": 1.2349, "grad_norm": 1.0251280069351196, "learning_rate": 0.0002, "epoch": 5.921273031825796, "step": 7070}, {"loss": 1.2225, "grad_norm": 1.0794076919555664, "learning_rate": 0.0002, "epoch": 5.9296482412060305, "step": 7080}, {"loss": 1.2978, "grad_norm": 0.9852448105812073, "learning_rate": 0.0002, "epoch": 5.938023450586265, "step": 7090}, {"loss": 1.3278, "grad_norm": 1.1678671836853027, "learning_rate": 0.0002, "epoch": 5.9463986599664995, "step": 7100}, {"loss": 1.2908, "grad_norm": 0.9818310141563416, "learning_rate": 0.0002, "epoch": 5.954773869346734, "step": 7110}, {"loss": 1.3406, "grad_norm": 1.0732046365737915, "learning_rate": 0.0002, "epoch": 5.9631490787269685, "step": 7120}, {"loss": 1.2402, "grad_norm": 0.912470281124115, "learning_rate": 0.0002, "epoch": 5.971524288107203, "step": 7130}, {"loss": 1.2979, "grad_norm": 1.0944788455963135, "learning_rate": 0.0002, "epoch": 5.9798994974874375, "step": 7140}, {"loss": 1.3249, "grad_norm": 1.0393965244293213, "learning_rate": 0.0002, "epoch": 5.988274706867672, "step": 7150}, {"loss": 1.2913, "grad_norm": 0.8758739233016968, "learning_rate": 0.0002, "epoch": 5.9966499162479066, "step": 7160}, {"eval_loss": 2.0526134967803955, "eval_runtime": 37.9699, "eval_samples_per_second": 13.563, "eval_steps_per_second": 1.712, "epoch": 6.0, "step": 7164}, {"loss": 1.1352, "grad_norm": 1.138184666633606, "learning_rate": 0.0002, "epoch": 6.005025125628141, "step": 7170}, {"loss": 1.0727, "grad_norm": 0.9295315742492676, "learning_rate": 0.0002, "epoch": 6.013400335008376, "step": 7180}, {"loss": 1.0859, "grad_norm": 1.1252633333206177, "learning_rate": 0.0002, "epoch": 6.02177554438861, "step": 7190}, {"loss": 1.0827, "grad_norm": 1.0611635446548462, "learning_rate": 0.0002, "epoch": 6.030150753768845, "step": 7200}, {"loss": 1.0756, "grad_norm": 1.022278070449829, "learning_rate": 0.0002, "epoch": 6.038525963149079, "step": 7210}, {"loss": 1.0616, "grad_norm": 1.0280728340148926, "learning_rate": 0.0002, "epoch": 6.046901172529314, "step": 7220}, {"loss": 1.0237, "grad_norm": 0.9516313076019287, "learning_rate": 0.0002, "epoch": 6.055276381909548, "step": 7230}, {"loss": 1.0388, "grad_norm": 1.0925321578979492, "learning_rate": 0.0002, "epoch": 6.063651591289783, "step": 7240}, {"loss": 1.113, "grad_norm": 0.9885565042495728, "learning_rate": 0.0002, "epoch": 6.072026800670017, "step": 7250}, {"loss": 1.1167, "grad_norm": 1.0905766487121582, "learning_rate": 0.0002, "epoch": 6.080402010050252, "step": 7260}, {"loss": 1.0775, "grad_norm": 1.075183391571045, "learning_rate": 0.0002, "epoch": 6.088777219430486, "step": 7270}, {"loss": 1.1371, "grad_norm": 1.0897727012634277, "learning_rate": 0.0002, "epoch": 6.097152428810721, "step": 7280}, {"loss": 1.0335, "grad_norm": 1.3677806854248047, "learning_rate": 0.0002, "epoch": 6.105527638190955, "step": 7290}, {"loss": 1.0566, "grad_norm": 1.1880329847335815, "learning_rate": 0.0002, "epoch": 6.11390284757119, "step": 7300}, {"loss": 1.061, "grad_norm": 1.036330223083496, "learning_rate": 0.0002, "epoch": 6.122278056951424, "step": 7310}, {"loss": 1.0621, "grad_norm": 1.2165348529815674, "learning_rate": 0.0002, "epoch": 6.130653266331659, "step": 7320}, {"loss": 1.0796, "grad_norm": 1.027368187904358, "learning_rate": 0.0002, "epoch": 6.139028475711893, "step": 7330}, {"loss": 1.0994, "grad_norm": 1.2497830390930176, "learning_rate": 0.0002, "epoch": 6.147403685092128, "step": 7340}, {"loss": 1.1616, "grad_norm": 1.166595458984375, "learning_rate": 0.0002, "epoch": 6.155778894472362, "step": 7350}, {"loss": 1.1301, "grad_norm": 1.1143730878829956, "learning_rate": 0.0002, "epoch": 6.164154103852597, "step": 7360}, {"loss": 1.0913, "grad_norm": 1.1531223058700562, "learning_rate": 0.0002, "epoch": 6.172529313232831, "step": 7370}, {"loss": 1.0819, "grad_norm": 1.176507830619812, "learning_rate": 0.0002, "epoch": 6.180904522613066, "step": 7380}, {"loss": 1.0375, "grad_norm": 1.3174604177474976, "learning_rate": 0.0002, "epoch": 6.1892797319933, "step": 7390}, {"loss": 1.1586, "grad_norm": 1.0284459590911865, "learning_rate": 0.0002, "epoch": 6.197654941373535, "step": 7400}, {"loss": 1.1044, "grad_norm": 1.0801599025726318, "learning_rate": 0.0002, "epoch": 6.206030150753769, "step": 7410}, {"loss": 1.1441, "grad_norm": 1.200514554977417, "learning_rate": 0.0002, "epoch": 6.214405360134004, "step": 7420}, {"loss": 1.0234, "grad_norm": 1.0148060321807861, "learning_rate": 0.0002, "epoch": 6.222780569514238, "step": 7430}, {"loss": 1.0616, "grad_norm": 1.2368836402893066, "learning_rate": 0.0002, "epoch": 6.231155778894473, "step": 7440}, {"loss": 1.0781, "grad_norm": 1.228834629058838, "learning_rate": 0.0002, "epoch": 6.239530988274707, "step": 7450}, {"loss": 1.1128, "grad_norm": 1.1588891744613647, "learning_rate": 0.0002, "epoch": 6.247906197654942, "step": 7460}, {"loss": 1.0807, "grad_norm": 1.3500380516052246, "learning_rate": 0.0002, "epoch": 6.256281407035176, "step": 7470}, {"loss": 1.1057, "grad_norm": 1.1429533958435059, "learning_rate": 0.0002, "epoch": 6.264656616415411, "step": 7480}, {"loss": 1.1519, "grad_norm": 1.2314441204071045, "learning_rate": 0.0002, "epoch": 6.273031825795645, "step": 7490}, {"loss": 1.0885, "grad_norm": 1.0917996168136597, "learning_rate": 0.0002, "epoch": 6.28140703517588, "step": 7500}, {"loss": 1.0786, "grad_norm": 1.3294450044631958, "learning_rate": 0.0002, "epoch": 6.289782244556114, "step": 7510}, {"loss": 1.1187, "grad_norm": 1.1035195589065552, "learning_rate": 0.0002, "epoch": 6.298157453936349, "step": 7520}, {"loss": 1.1183, "grad_norm": 1.2643269300460815, "learning_rate": 0.0002, "epoch": 6.306532663316583, "step": 7530}, {"loss": 1.0767, "grad_norm": 1.2226417064666748, "learning_rate": 0.0002, "epoch": 6.314907872696818, "step": 7540}, {"loss": 1.1335, "grad_norm": 1.0248615741729736, "learning_rate": 0.0002, "epoch": 6.323283082077052, "step": 7550}, {"loss": 1.0856, "grad_norm": 1.28317129611969, "learning_rate": 0.0002, "epoch": 6.331658291457287, "step": 7560}, {"loss": 1.166, "grad_norm": 1.1461660861968994, "learning_rate": 0.0002, "epoch": 6.340033500837521, "step": 7570}, {"loss": 1.1627, "grad_norm": 1.297136664390564, "learning_rate": 0.0002, "epoch": 6.348408710217756, "step": 7580}, {"loss": 1.1342, "grad_norm": 1.3376781940460205, "learning_rate": 0.0002, "epoch": 6.35678391959799, "step": 7590}, {"loss": 1.072, "grad_norm": 1.2507376670837402, "learning_rate": 0.0002, "epoch": 6.365159128978225, "step": 7600}, {"loss": 1.0731, "grad_norm": 1.3255126476287842, "learning_rate": 0.0002, "epoch": 6.373534338358459, "step": 7610}, {"loss": 1.0818, "grad_norm": 1.1082066297531128, "learning_rate": 0.0002, "epoch": 6.381909547738694, "step": 7620}, {"loss": 1.0894, "grad_norm": 1.4461497068405151, "learning_rate": 0.0002, "epoch": 6.390284757118928, "step": 7630}, {"loss": 1.1443, "grad_norm": 1.2875033617019653, "learning_rate": 0.0002, "epoch": 6.398659966499163, "step": 7640}, {"loss": 1.1027, "grad_norm": 1.1017295122146606, "learning_rate": 0.0002, "epoch": 6.407035175879397, "step": 7650}, {"loss": 1.1046, "grad_norm": 1.1896536350250244, "learning_rate": 0.0002, "epoch": 6.415410385259632, "step": 7660}, {"loss": 1.1207, "grad_norm": 1.0939011573791504, "learning_rate": 0.0002, "epoch": 6.423785594639866, "step": 7670}, {"loss": 1.1338, "grad_norm": 1.2593132257461548, "learning_rate": 0.0002, "epoch": 6.432160804020101, "step": 7680}, {"loss": 1.071, "grad_norm": 1.1151225566864014, "learning_rate": 0.0002, "epoch": 6.440536013400335, "step": 7690}, {"loss": 1.1832, "grad_norm": 1.0686280727386475, "learning_rate": 0.0002, "epoch": 6.44891122278057, "step": 7700}, {"loss": 1.1611, "grad_norm": 1.4008738994598389, "learning_rate": 0.0002, "epoch": 6.457286432160804, "step": 7710}, {"loss": 1.1191, "grad_norm": 1.1698687076568604, "learning_rate": 0.0002, "epoch": 6.465661641541039, "step": 7720}, {"loss": 1.1637, "grad_norm": 1.1306401491165161, "learning_rate": 0.0002, "epoch": 6.474036850921273, "step": 7730}, {"loss": 1.1534, "grad_norm": 1.2970236539840698, "learning_rate": 0.0002, "epoch": 6.482412060301508, "step": 7740}, {"loss": 1.1408, "grad_norm": 1.1515544652938843, "learning_rate": 0.0002, "epoch": 6.490787269681742, "step": 7750}, {"loss": 1.098, "grad_norm": 1.13273024559021, "learning_rate": 0.0002, "epoch": 6.499162479061977, "step": 7760}, {"loss": 1.1356, "grad_norm": 1.1635724306106567, "learning_rate": 0.0002, "epoch": 6.507537688442211, "step": 7770}, {"loss": 1.0849, "grad_norm": 1.1620264053344727, "learning_rate": 0.0002, "epoch": 6.515912897822446, "step": 7780}, {"loss": 1.1786, "grad_norm": 1.159905195236206, "learning_rate": 0.0002, "epoch": 6.52428810720268, "step": 7790}, {"loss": 1.1252, "grad_norm": 1.2243341207504272, "learning_rate": 0.0002, "epoch": 6.532663316582915, "step": 7800}, {"loss": 1.1654, "grad_norm": 1.1034481525421143, "learning_rate": 0.0002, "epoch": 6.541038525963149, "step": 7810}, {"loss": 1.1579, "grad_norm": 1.1131408214569092, "learning_rate": 0.0002, "epoch": 6.549413735343384, "step": 7820}, {"loss": 1.1053, "grad_norm": 1.211260199546814, "learning_rate": 0.0002, "epoch": 6.557788944723618, "step": 7830}, {"loss": 1.1178, "grad_norm": 1.408692717552185, "learning_rate": 0.0002, "epoch": 6.566164154103853, "step": 7840}, {"loss": 1.1586, "grad_norm": 1.151441216468811, "learning_rate": 0.0002, "epoch": 6.574539363484087, "step": 7850}, {"loss": 1.1754, "grad_norm": 1.1160012483596802, "learning_rate": 0.0002, "epoch": 6.582914572864322, "step": 7860}, {"loss": 1.1092, "grad_norm": 1.2496052980422974, "learning_rate": 0.0002, "epoch": 6.591289782244556, "step": 7870}, {"loss": 1.2007, "grad_norm": 1.559907078742981, "learning_rate": 0.0002, "epoch": 6.599664991624791, "step": 7880}, {"loss": 1.1482, "grad_norm": 1.4399309158325195, "learning_rate": 0.0002, "epoch": 6.608040201005025, "step": 7890}, {"loss": 1.1801, "grad_norm": 1.155007243156433, "learning_rate": 0.0002, "epoch": 6.61641541038526, "step": 7900}, {"loss": 1.2029, "grad_norm": 1.4339076280593872, "learning_rate": 0.0002, "epoch": 6.624790619765494, "step": 7910}, {"loss": 1.1594, "grad_norm": 1.2093058824539185, "learning_rate": 0.0002, "epoch": 6.633165829145729, "step": 7920}, {"loss": 1.185, "grad_norm": 1.1619434356689453, "learning_rate": 0.0002, "epoch": 6.641541038525963, "step": 7930}, {"loss": 1.1369, "grad_norm": 1.2879594564437866, "learning_rate": 0.0002, "epoch": 6.649916247906198, "step": 7940}, {"loss": 1.1992, "grad_norm": 1.0598394870758057, "learning_rate": 0.0002, "epoch": 6.658291457286432, "step": 7950}, {"loss": 1.1337, "grad_norm": 1.0937503576278687, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 7960}, {"loss": 1.1137, "grad_norm": 1.2670115232467651, "learning_rate": 0.0002, "epoch": 6.675041876046901, "step": 7970}, {"loss": 1.1711, "grad_norm": 1.2351782321929932, "learning_rate": 0.0002, "epoch": 6.683417085427136, "step": 7980}, {"loss": 1.1774, "grad_norm": 1.344128131866455, "learning_rate": 0.0002, "epoch": 6.69179229480737, "step": 7990}, {"loss": 1.1739, "grad_norm": 1.2894740104675293, "learning_rate": 0.0002, "epoch": 6.700167504187605, "step": 8000}, {"loss": 1.1045, "grad_norm": 1.1804684400558472, "learning_rate": 0.0002, "epoch": 6.708542713567839, "step": 8010}, {"loss": 1.2371, "grad_norm": 1.314237356185913, "learning_rate": 0.0002, "epoch": 6.716917922948074, "step": 8020}, {"loss": 1.1113, "grad_norm": 1.2132530212402344, "learning_rate": 0.0002, "epoch": 6.725293132328308, "step": 8030}, {"loss": 1.1467, "grad_norm": 0.999580979347229, "learning_rate": 0.0002, "epoch": 6.733668341708543, "step": 8040}, {"loss": 1.1418, "grad_norm": 1.206323266029358, "learning_rate": 0.0002, "epoch": 6.742043551088777, "step": 8050}, {"loss": 1.1265, "grad_norm": 1.1092344522476196, "learning_rate": 0.0002, "epoch": 6.750418760469012, "step": 8060}, {"loss": 1.1583, "grad_norm": 1.0168755054473877, "learning_rate": 0.0002, "epoch": 6.758793969849246, "step": 8070}, {"loss": 1.189, "grad_norm": 1.2310614585876465, "learning_rate": 0.0002, "epoch": 6.767169179229481, "step": 8080}, {"loss": 1.1775, "grad_norm": 1.1587172746658325, "learning_rate": 0.0002, "epoch": 6.775544388609715, "step": 8090}, {"loss": 1.1761, "grad_norm": 1.1362504959106445, "learning_rate": 0.0002, "epoch": 6.78391959798995, "step": 8100}, {"loss": 1.1521, "grad_norm": 1.3735119104385376, "learning_rate": 0.0002, "epoch": 6.792294807370184, "step": 8110}, {"loss": 1.1214, "grad_norm": 1.1804813146591187, "learning_rate": 0.0002, "epoch": 6.800670016750419, "step": 8120}, {"loss": 1.1035, "grad_norm": 1.1849592924118042, "learning_rate": 0.0002, "epoch": 6.809045226130653, "step": 8130}, {"loss": 1.1622, "grad_norm": 1.1638602018356323, "learning_rate": 0.0002, "epoch": 6.817420435510888, "step": 8140}, {"loss": 1.1178, "grad_norm": 1.2106250524520874, "learning_rate": 0.0002, "epoch": 6.825795644891122, "step": 8150}, {"loss": 1.2231, "grad_norm": 1.276068091392517, "learning_rate": 0.0002, "epoch": 6.834170854271357, "step": 8160}, {"loss": 1.1309, "grad_norm": 1.4283488988876343, "learning_rate": 0.0002, "epoch": 6.842546063651591, "step": 8170}, {"loss": 1.1494, "grad_norm": 1.4286448955535889, "learning_rate": 0.0002, "epoch": 6.850921273031826, "step": 8180}, {"loss": 1.185, "grad_norm": 1.191275715827942, "learning_rate": 0.0002, "epoch": 6.85929648241206, "step": 8190}, {"loss": 1.1984, "grad_norm": 1.4232908487319946, "learning_rate": 0.0002, "epoch": 6.867671691792295, "step": 8200}, {"loss": 1.182, "grad_norm": 1.2166317701339722, "learning_rate": 0.0002, "epoch": 6.876046901172529, "step": 8210}, {"loss": 1.1311, "grad_norm": 1.0487027168273926, "learning_rate": 0.0002, "epoch": 6.884422110552764, "step": 8220}, {"loss": 1.1973, "grad_norm": 1.247178077697754, "learning_rate": 0.0002, "epoch": 6.892797319932998, "step": 8230}, {"loss": 1.0942, "grad_norm": 1.0728635787963867, "learning_rate": 0.0002, "epoch": 6.901172529313233, "step": 8240}, {"loss": 1.2106, "grad_norm": 1.1909451484680176, "learning_rate": 0.0002, "epoch": 6.909547738693467, "step": 8250}, {"loss": 1.1336, "grad_norm": 1.337556004524231, "learning_rate": 0.0002, "epoch": 6.917922948073702, "step": 8260}, {"loss": 1.2295, "grad_norm": 1.1479394435882568, "learning_rate": 0.0002, "epoch": 6.926298157453936, "step": 8270}, {"loss": 1.1497, "grad_norm": 1.2038872241973877, "learning_rate": 0.0002, "epoch": 6.934673366834171, "step": 8280}, {"loss": 1.1806, "grad_norm": 1.088813066482544, "learning_rate": 0.0002, "epoch": 6.943048576214405, "step": 8290}, {"loss": 1.181, "grad_norm": 1.0153290033340454, "learning_rate": 0.0002, "epoch": 6.95142378559464, "step": 8300}, {"loss": 1.1846, "grad_norm": 1.2159703969955444, "learning_rate": 0.0002, "epoch": 6.959798994974874, "step": 8310}, {"loss": 1.1029, "grad_norm": 1.0844143629074097, "learning_rate": 0.0002, "epoch": 6.968174204355109, "step": 8320}, {"loss": 1.1843, "grad_norm": 1.1617385149002075, "learning_rate": 0.0002, "epoch": 6.976549413735343, "step": 8330}, {"loss": 1.177, "grad_norm": 1.126503586769104, "learning_rate": 0.0002, "epoch": 6.984924623115578, "step": 8340}, {"loss": 1.1753, "grad_norm": 1.1553548574447632, "learning_rate": 0.0002, "epoch": 6.993299832495812, "step": 8350}, {"eval_loss": 2.1463968753814697, "eval_runtime": 37.9219, "eval_samples_per_second": 13.581, "eval_steps_per_second": 1.714, "epoch": 7.0, "step": 8358}, {"loss": 1.1205, "grad_norm": 1.0229777097702026, "learning_rate": 0.0002, "epoch": 7.001675041876047, "step": 8360}, {"loss": 0.9556, "grad_norm": 1.2346612215042114, "learning_rate": 0.0002, "epoch": 7.010050251256281, "step": 8370}, {"loss": 0.9406, "grad_norm": 1.2478288412094116, "learning_rate": 0.0002, "epoch": 7.018425460636516, "step": 8380}, {"loss": 0.9603, "grad_norm": 1.3081458806991577, "learning_rate": 0.0002, "epoch": 7.02680067001675, "step": 8390}, {"loss": 0.9594, "grad_norm": 1.508225440979004, "learning_rate": 0.0002, "epoch": 7.035175879396985, "step": 8400}, {"loss": 0.9472, "grad_norm": 1.7482528686523438, "learning_rate": 0.0002, "epoch": 7.043551088777219, "step": 8410}, {"loss": 1.0217, "grad_norm": 1.3465625047683716, "learning_rate": 0.0002, "epoch": 7.051926298157454, "step": 8420}, {"loss": 0.9683, "grad_norm": 1.3181530237197876, "learning_rate": 0.0002, "epoch": 7.060301507537688, "step": 8430}, {"loss": 0.9296, "grad_norm": 1.2666151523590088, "learning_rate": 0.0002, "epoch": 7.068676716917923, "step": 8440}, {"loss": 0.989, "grad_norm": 1.5192651748657227, "learning_rate": 0.0002, "epoch": 7.077051926298157, "step": 8450}, {"loss": 0.9281, "grad_norm": 1.3075478076934814, "learning_rate": 0.0002, "epoch": 7.085427135678392, "step": 8460}, {"loss": 0.9098, "grad_norm": 1.0856449604034424, "learning_rate": 0.0002, "epoch": 7.093802345058626, "step": 8470}, {"loss": 0.9813, "grad_norm": 1.299716830253601, "learning_rate": 0.0002, "epoch": 7.102177554438861, "step": 8480}, {"loss": 0.9572, "grad_norm": 1.4345086812973022, "learning_rate": 0.0002, "epoch": 7.110552763819095, "step": 8490}, {"loss": 0.9705, "grad_norm": 1.4502071142196655, "learning_rate": 0.0002, "epoch": 7.11892797319933, "step": 8500}, {"loss": 0.9073, "grad_norm": 1.315466284751892, "learning_rate": 0.0002, "epoch": 7.127303182579564, "step": 8510}, {"loss": 0.9635, "grad_norm": 1.2893296480178833, "learning_rate": 0.0002, "epoch": 7.135678391959799, "step": 8520}, {"loss": 0.9636, "grad_norm": 1.4431706666946411, "learning_rate": 0.0002, "epoch": 7.144053601340033, "step": 8530}, {"loss": 0.9761, "grad_norm": 1.2943761348724365, "learning_rate": 0.0002, "epoch": 7.152428810720268, "step": 8540}, {"loss": 1.0148, "grad_norm": 1.2941267490386963, "learning_rate": 0.0002, "epoch": 7.160804020100502, "step": 8550}, {"loss": 0.9336, "grad_norm": 1.1535996198654175, "learning_rate": 0.0002, "epoch": 7.169179229480737, "step": 8560}, {"loss": 0.9691, "grad_norm": 1.4487816095352173, "learning_rate": 0.0002, "epoch": 7.177554438860971, "step": 8570}, {"loss": 0.9904, "grad_norm": 1.2985659837722778, "learning_rate": 0.0002, "epoch": 7.185929648241206, "step": 8580}, {"loss": 0.9359, "grad_norm": 1.2589194774627686, "learning_rate": 0.0002, "epoch": 7.19430485762144, "step": 8590}, {"loss": 0.9239, "grad_norm": 1.327163815498352, "learning_rate": 0.0002, "epoch": 7.202680067001675, "step": 8600}, {"loss": 0.9809, "grad_norm": 1.2303123474121094, "learning_rate": 0.0002, "epoch": 7.211055276381909, "step": 8610}, {"loss": 0.967, "grad_norm": 1.5056939125061035, "learning_rate": 0.0002, "epoch": 7.219430485762144, "step": 8620}, {"loss": 0.987, "grad_norm": 1.5022825002670288, "learning_rate": 0.0002, "epoch": 7.227805695142378, "step": 8630}, {"loss": 1.0659, "grad_norm": 1.3092796802520752, "learning_rate": 0.0002, "epoch": 7.236180904522613, "step": 8640}, {"loss": 0.9434, "grad_norm": 1.2752959728240967, "learning_rate": 0.0002, "epoch": 7.244556113902847, "step": 8650}, {"loss": 0.9833, "grad_norm": 1.2906183004379272, "learning_rate": 0.0002, "epoch": 7.252931323283082, "step": 8660}, {"loss": 0.9843, "grad_norm": 1.6165488958358765, "learning_rate": 0.0002, "epoch": 7.261306532663316, "step": 8670}, {"loss": 1.0087, "grad_norm": 1.5356138944625854, "learning_rate": 0.0002, "epoch": 7.269681742043551, "step": 8680}, {"loss": 1.0101, "grad_norm": 1.4998574256896973, "learning_rate": 0.0002, "epoch": 7.278056951423785, "step": 8690}, {"loss": 0.9908, "grad_norm": 1.3943705558776855, "learning_rate": 0.0002, "epoch": 7.28643216080402, "step": 8700}, {"loss": 0.9857, "grad_norm": 1.2478622198104858, "learning_rate": 0.0002, "epoch": 7.294807370184254, "step": 8710}, {"loss": 0.9419, "grad_norm": 1.6093883514404297, "learning_rate": 0.0002, "epoch": 7.303182579564489, "step": 8720}, {"loss": 0.9502, "grad_norm": 1.2838177680969238, "learning_rate": 0.0002, "epoch": 7.311557788944723, "step": 8730}, {"loss": 1.025, "grad_norm": 1.3537026643753052, "learning_rate": 0.0002, "epoch": 7.319932998324958, "step": 8740}, {"loss": 0.9632, "grad_norm": 1.5077383518218994, "learning_rate": 0.0002, "epoch": 7.328308207705192, "step": 8750}, {"loss": 1.0158, "grad_norm": 1.4921475648880005, "learning_rate": 0.0002, "epoch": 7.336683417085427, "step": 8760}, {"loss": 0.9919, "grad_norm": 1.3573282957077026, "learning_rate": 0.0002, "epoch": 7.345058626465661, "step": 8770}, {"loss": 1.0483, "grad_norm": 1.3224730491638184, "learning_rate": 0.0002, "epoch": 7.353433835845896, "step": 8780}, {"loss": 0.9874, "grad_norm": 1.3497579097747803, "learning_rate": 0.0002, "epoch": 7.36180904522613, "step": 8790}, {"loss": 0.9853, "grad_norm": 1.1072763204574585, "learning_rate": 0.0002, "epoch": 7.370184254606365, "step": 8800}, {"loss": 1.0036, "grad_norm": 1.3373113870620728, "learning_rate": 0.0002, "epoch": 7.3785594639865995, "step": 8810}, {"loss": 0.9636, "grad_norm": 1.2301900386810303, "learning_rate": 0.0002, "epoch": 7.386934673366834, "step": 8820}, {"loss": 0.9903, "grad_norm": 1.4248781204223633, "learning_rate": 0.0002, "epoch": 7.3953098827470685, "step": 8830}, {"loss": 0.9802, "grad_norm": 1.6177928447723389, "learning_rate": 0.0002, "epoch": 7.403685092127303, "step": 8840}, {"loss": 1.0346, "grad_norm": 1.3096086978912354, "learning_rate": 0.0002, "epoch": 7.4120603015075375, "step": 8850}, {"loss": 1.0274, "grad_norm": 1.5262911319732666, "learning_rate": 0.0002, "epoch": 7.420435510887772, "step": 8860}, {"loss": 0.9894, "grad_norm": 1.7313627004623413, "learning_rate": 0.0002, "epoch": 7.4288107202680065, "step": 8870}, {"loss": 0.9834, "grad_norm": 1.3323025703430176, "learning_rate": 0.0002, "epoch": 7.437185929648241, "step": 8880}, {"loss": 1.0052, "grad_norm": 1.3253904581069946, "learning_rate": 0.0002, "epoch": 7.4455611390284755, "step": 8890}, {"loss": 1.0274, "grad_norm": 1.3685275316238403, "learning_rate": 0.0002, "epoch": 7.45393634840871, "step": 8900}, {"loss": 1.0126, "grad_norm": 1.4222962856292725, "learning_rate": 0.0002, "epoch": 7.4623115577889445, "step": 8910}, {"loss": 0.9508, "grad_norm": 1.429887056350708, "learning_rate": 0.0002, "epoch": 7.4706867671691795, "step": 8920}, {"loss": 1.0003, "grad_norm": 1.455110788345337, "learning_rate": 0.0002, "epoch": 7.4790619765494135, "step": 8930}, {"loss": 1.0206, "grad_norm": 1.298094630241394, "learning_rate": 0.0002, "epoch": 7.4874371859296485, "step": 8940}, {"loss": 1.0263, "grad_norm": 1.280696988105774, "learning_rate": 0.0002, "epoch": 7.4958123953098825, "step": 8950}, {"loss": 1.0196, "grad_norm": 1.2990771532058716, "learning_rate": 0.0002, "epoch": 7.5041876046901175, "step": 8960}, {"loss": 0.9732, "grad_norm": 1.5361275672912598, "learning_rate": 0.0002, "epoch": 7.5125628140703515, "step": 8970}, {"loss": 0.9778, "grad_norm": 1.2716164588928223, "learning_rate": 0.0002, "epoch": 7.5209380234505865, "step": 8980}, {"loss": 1.0031, "grad_norm": 1.5293556451797485, "learning_rate": 0.0002, "epoch": 7.5293132328308205, "step": 8990}, {"loss": 0.9817, "grad_norm": 1.5210952758789062, "learning_rate": 0.0002, "epoch": 7.5376884422110555, "step": 9000}, {"loss": 0.9998, "grad_norm": 1.2735507488250732, "learning_rate": 0.0002, "epoch": 7.54606365159129, "step": 9010}, {"loss": 1.001, "grad_norm": 1.3383569717407227, "learning_rate": 0.0002, "epoch": 7.5544388609715245, "step": 9020}, {"loss": 0.9423, "grad_norm": 1.471486210823059, "learning_rate": 0.0002, "epoch": 7.562814070351759, "step": 9030}, {"loss": 1.0043, "grad_norm": 1.4516266584396362, "learning_rate": 0.0002, "epoch": 7.5711892797319935, "step": 9040}, {"loss": 1.0154, "grad_norm": 1.8539457321166992, "learning_rate": 0.0002, "epoch": 7.579564489112228, "step": 9050}, {"loss": 0.9901, "grad_norm": 1.394018292427063, "learning_rate": 0.0002, "epoch": 7.5879396984924625, "step": 9060}, {"loss": 1.0031, "grad_norm": 1.4161924123764038, "learning_rate": 0.0002, "epoch": 7.596314907872697, "step": 9070}, {"loss": 1.0205, "grad_norm": 1.5264959335327148, "learning_rate": 0.0002, "epoch": 7.6046901172529315, "step": 9080}, {"loss": 0.9758, "grad_norm": 1.3996148109436035, "learning_rate": 0.0002, "epoch": 7.613065326633166, "step": 9090}, {"loss": 1.027, "grad_norm": 1.485904574394226, "learning_rate": 0.0002, "epoch": 7.6214405360134005, "step": 9100}, {"loss": 0.9973, "grad_norm": 1.361729621887207, "learning_rate": 0.0002, "epoch": 7.629815745393635, "step": 9110}, {"loss": 1.0794, "grad_norm": 1.3930991888046265, "learning_rate": 0.0002, "epoch": 7.63819095477387, "step": 9120}, {"loss": 1.0524, "grad_norm": 1.3981443643569946, "learning_rate": 0.0002, "epoch": 7.646566164154104, "step": 9130}, {"loss": 1.0171, "grad_norm": 1.325538158416748, "learning_rate": 0.0002, "epoch": 7.654941373534339, "step": 9140}, {"loss": 1.0579, "grad_norm": 1.7479078769683838, "learning_rate": 0.0002, "epoch": 7.663316582914573, "step": 9150}, {"loss": 0.9984, "grad_norm": 1.6959037780761719, "learning_rate": 0.0002, "epoch": 7.671691792294808, "step": 9160}, {"loss": 1.0603, "grad_norm": 1.218790054321289, "learning_rate": 0.0002, "epoch": 7.680067001675042, "step": 9170}, {"loss": 1.0529, "grad_norm": 1.4050689935684204, "learning_rate": 0.0002, "epoch": 7.688442211055277, "step": 9180}, {"loss": 0.9908, "grad_norm": 1.361841082572937, "learning_rate": 0.0002, "epoch": 7.696817420435511, "step": 9190}, {"loss": 1.0738, "grad_norm": 1.1516344547271729, "learning_rate": 0.0002, "epoch": 7.705192629815746, "step": 9200}, {"loss": 1.0146, "grad_norm": 1.5105586051940918, "learning_rate": 0.0002, "epoch": 7.71356783919598, "step": 9210}, {"loss": 1.0912, "grad_norm": 1.4226511716842651, "learning_rate": 0.0002, "epoch": 7.721943048576215, "step": 9220}, {"loss": 1.0109, "grad_norm": 1.4334726333618164, "learning_rate": 0.0002, "epoch": 7.730318257956449, "step": 9230}, {"loss": 0.9502, "grad_norm": 1.144550085067749, "learning_rate": 0.0002, "epoch": 7.738693467336684, "step": 9240}, {"loss": 0.9771, "grad_norm": 1.292710781097412, "learning_rate": 0.0002, "epoch": 7.747068676716918, "step": 9250}, {"loss": 1.0247, "grad_norm": 1.3884655237197876, "learning_rate": 0.0002, "epoch": 7.755443886097153, "step": 9260}, {"loss": 1.0467, "grad_norm": 1.5045685768127441, "learning_rate": 0.0002, "epoch": 7.763819095477387, "step": 9270}, {"loss": 1.0393, "grad_norm": 1.3433866500854492, "learning_rate": 0.0002, "epoch": 7.772194304857622, "step": 9280}, {"loss": 1.0634, "grad_norm": 1.4879025220870972, "learning_rate": 0.0002, "epoch": 7.780569514237856, "step": 9290}, {"loss": 0.997, "grad_norm": 1.3347378969192505, "learning_rate": 0.0002, "epoch": 7.788944723618091, "step": 9300}, {"loss": 1.017, "grad_norm": 1.3727476596832275, "learning_rate": 0.0002, "epoch": 7.797319932998325, "step": 9310}, {"loss": 1.0367, "grad_norm": 1.4126251935958862, "learning_rate": 0.0002, "epoch": 7.80569514237856, "step": 9320}, {"loss": 1.0237, "grad_norm": 1.2898106575012207, "learning_rate": 0.0002, "epoch": 7.814070351758794, "step": 9330}, {"loss": 1.0605, "grad_norm": 1.2732993364334106, "learning_rate": 0.0002, "epoch": 7.822445561139029, "step": 9340}, {"loss": 0.9851, "grad_norm": 1.1767915487289429, "learning_rate": 0.0002, "epoch": 7.830820770519263, "step": 9350}, {"loss": 1.0196, "grad_norm": 1.308590054512024, "learning_rate": 0.0002, "epoch": 7.839195979899498, "step": 9360}, {"loss": 1.0092, "grad_norm": 1.3947384357452393, "learning_rate": 0.0002, "epoch": 7.847571189279732, "step": 9370}, {"loss": 1.0313, "grad_norm": 1.3855421543121338, "learning_rate": 0.0002, "epoch": 7.855946398659967, "step": 9380}, {"loss": 1.0267, "grad_norm": 1.5742900371551514, "learning_rate": 0.0002, "epoch": 7.864321608040201, "step": 9390}, {"loss": 0.988, "grad_norm": 1.4731863737106323, "learning_rate": 0.0002, "epoch": 7.872696817420436, "step": 9400}, {"loss": 1.0387, "grad_norm": 1.5974364280700684, "learning_rate": 0.0002, "epoch": 7.88107202680067, "step": 9410}, {"loss": 1.105, "grad_norm": 1.574455738067627, "learning_rate": 0.0002, "epoch": 7.889447236180905, "step": 9420}, {"loss": 1.0818, "grad_norm": 1.3285928964614868, "learning_rate": 0.0002, "epoch": 7.897822445561139, "step": 9430}, {"loss": 1.0631, "grad_norm": 1.2003569602966309, "learning_rate": 0.0002, "epoch": 7.906197654941374, "step": 9440}, {"loss": 1.0911, "grad_norm": 1.2798550128936768, "learning_rate": 0.0002, "epoch": 7.914572864321608, "step": 9450}, {"loss": 1.0306, "grad_norm": 1.533443570137024, "learning_rate": 0.0002, "epoch": 7.922948073701843, "step": 9460}, {"loss": 1.0484, "grad_norm": 1.525195837020874, "learning_rate": 0.0002, "epoch": 7.931323283082077, "step": 9470}, {"loss": 1.0372, "grad_norm": 1.3638207912445068, "learning_rate": 0.0002, "epoch": 7.939698492462312, "step": 9480}, {"loss": 0.9591, "grad_norm": 1.4047036170959473, "learning_rate": 0.0002, "epoch": 7.948073701842546, "step": 9490}, {"loss": 1.0279, "grad_norm": 1.2534632682800293, "learning_rate": 0.0002, "epoch": 7.956448911222781, "step": 9500}, {"loss": 1.0109, "grad_norm": 1.4334971904754639, "learning_rate": 0.0002, "epoch": 7.964824120603015, "step": 9510}, {"loss": 1.0511, "grad_norm": 1.2948139905929565, "learning_rate": 0.0002, "epoch": 7.97319932998325, "step": 9520}, {"loss": 1.0856, "grad_norm": 1.3664277791976929, "learning_rate": 0.0002, "epoch": 7.981574539363484, "step": 9530}, {"loss": 1.0131, "grad_norm": 1.3293516635894775, "learning_rate": 0.0002, "epoch": 7.989949748743719, "step": 9540}, {"loss": 1.0864, "grad_norm": 1.5311461687088013, "learning_rate": 0.0002, "epoch": 7.998324958123953, "step": 9550}]}