diff --git a/.gitattributes b/.gitattributes index 49f6edef9a1aec116d76235ab64a325d754b94dc..36b931091ecf535422c544d59d956cf346d323e7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1371,3 +1371,12 @@ gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d- gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-10000/checkpoint-49/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-10000/checkpoint-98/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2b-it_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-798-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d94d2e18dd52d3c2cbdb2cdacd39b494f377579 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ef36e46201d1845bae06dbb20636bf471bccbedf31ab0e9dfa647101eab9b6f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..deee042638f7bbef05f413233b81c481c6d7d0fd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d350c2599b26fa99b7401b55e8fadd9f9190dcf5b52d1069ecb74d8a3d73801e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..125e8b7f3e42851db5c6aada8a7158fdf2781358 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d96cf2574485440422b346b2fc4c5ac03e53453d7a885add6a504186a68198 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..878fb6e551ae51a35f20fa135cecac2c0d054347 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ed8f8a8b4919ec08b0cb6fb7cf28b309e2a4c2a24595f37473a44d718531e91 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3581480457094eb5d528ee4f2ef8d579afb7a69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0016b4bbd258541633c486b172137b98c310916ac8294c366053f7decbbd7444 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1c079e7bf86750bcd79fec4e3cc2324ed9c1ef6c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/trainer_state.json @@ -0,0 +1,874 @@ +{ + "best_metric": 1.8086129426956177, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191", + "epoch": 0.9995803608896349, + "eval_steps": 10, + "global_step": 1191, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.513995212344525e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d94d2e18dd52d3c2cbdb2cdacd39b494f377579 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ef36e46201d1845bae06dbb20636bf471bccbedf31ab0e9dfa647101eab9b6f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..69bf62c3323d879cac224a20bbb9a60fbc413247 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d08ab91e09c16f2221f4a1f472e08a24847bc89283ba5d1b269d9c8f5332ed0a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..33faca77e4f32b5f386a8d33237a8a9b053df6c5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52490f219cae8ec709a166829314ee6c31330ff670c56380baa04e1d0bf8ed46 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..90db1c3c138d38655e32c44d74c351ea1a1bab40 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524f81a7ac3826785ef6a2e30aae2dc610717456f6fe0baf723da4edc8589dce +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9421e77077b3339e0b1f452f2af4a5248f9d38f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/trainer_state.json @@ -0,0 +1,1715 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2383, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.102799042468905e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac04909995fac95c60588acf8ec401c3a3ca796c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a82b12528ba0cb094f6a7a6b5a3e121e74cbf4dde13305fb7041754c61d058b +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..68edaf58627a57d68871632fa181362ef4fa74d3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e956fe4890a4d19ab4b9fc8c496c31c8b619afbe9dd58aeded57bb04f3aec1 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..79cc145aad3de15095e85b60aefdf869f732e53d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0078b673947e96a934cf633382b08dd1d0159eed319a28d909178cf7cffca43a +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bbfed57ff681c988d7ebfc16ecee65581b03cf1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b48644d9aac055218ca0fc2a74d0b2fc3d06f2265fe9dfdbacab4517f1e6acfb +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..92fd37bb600473e46afecabb0acf48061f2bf3d8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/trainer_state.json @@ -0,0 +1,2556 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 2.999580360889635, + "eval_steps": 10, + "global_step": 3574, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6541985637033574e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-3574/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9bc8ff3dd8780af7c91fc0934e746474173ef38f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ecea9fd41ee4efcf31c55cc213e81197671a344baf7af9b182ba35522ae859c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8c69be7b8a0703049448698a2ada15d2be4b1ce --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9bf4551f4b0fcbcb2cbbfc9f82546c0f2bd0f360423c36ce8c6e424666000c +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..91b091707dd44a954ae020cb843ea5def3583ca3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5e8ede0d8d151410c9ff1e7dcc960ee3124584f43a7297bddfa3c6e49cea94 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7c222833006b3a603db40391fa2121ef5d01090 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c9a51bfec08a4508ed6519aa4bea8664c78ae1f9cac0fa0bc190b357c262851 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..550cebd576e02b6e067da7d35d080ebce30d29d6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/trainer_state.json @@ -0,0 +1,3397 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 4766, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + }, + { + "epoch": 3.004616030214016, + "grad_norm": 0.4380604326725006, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 3580 + }, + { + "epoch": 3.0130088124213175, + "grad_norm": 0.5375564098358154, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 3590 + }, + { + "epoch": 3.0214015946286192, + "grad_norm": 0.50722736120224, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3600 + }, + { + "epoch": 3.029794376835921, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 3610 + }, + { + "epoch": 3.038187159043223, + "grad_norm": 0.520709753036499, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 3620 + }, + { + "epoch": 3.0465799412505246, + "grad_norm": 0.5429664850234985, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3630 + }, + { + "epoch": 3.0549727234578263, + "grad_norm": 0.5634943842887878, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 3640 + }, + { + "epoch": 3.063365505665128, + "grad_norm": 0.5042277574539185, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 3650 + }, + { + "epoch": 3.07175828787243, + "grad_norm": 0.5778711438179016, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 3660 + }, + { + "epoch": 3.080151070079731, + "grad_norm": 0.5504926443099976, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 3670 + }, + { + "epoch": 3.088543852287033, + "grad_norm": 0.5199463963508606, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3680 + }, + { + "epoch": 3.0969366344943348, + "grad_norm": 0.552334189414978, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 3690 + }, + { + "epoch": 3.1053294167016365, + "grad_norm": 0.5650873780250549, + "learning_rate": 0.0002, + "loss": 1.4638, + "step": 3700 + }, + { + "epoch": 3.1137221989089383, + "grad_norm": 0.6292349696159363, + "learning_rate": 0.0002, + "loss": 1.4945, + "step": 3710 + }, + { + "epoch": 3.12211498111624, + "grad_norm": 0.5523604154586792, + "learning_rate": 0.0002, + "loss": 1.4787, + "step": 3720 + }, + { + "epoch": 3.130507763323542, + "grad_norm": 0.6160100698471069, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3730 + }, + { + "epoch": 3.1389005455308436, + "grad_norm": 0.6091629266738892, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3740 + }, + { + "epoch": 3.1472933277381454, + "grad_norm": 0.5695531964302063, + "learning_rate": 0.0002, + "loss": 1.4659, + "step": 3750 + }, + { + "epoch": 3.1556861099454467, + "grad_norm": 0.569611132144928, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 3760 + }, + { + "epoch": 3.1640788921527485, + "grad_norm": 0.5761140584945679, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 3770 + }, + { + "epoch": 3.1724716743600503, + "grad_norm": 0.6855548620223999, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 3780 + }, + { + "epoch": 3.180864456567352, + "grad_norm": 0.5815101265907288, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 3790 + }, + { + "epoch": 3.189257238774654, + "grad_norm": 0.6179960370063782, + "learning_rate": 0.0002, + "loss": 1.5289, + "step": 3800 + }, + { + "epoch": 3.1976500209819556, + "grad_norm": 0.5418674349784851, + "learning_rate": 0.0002, + "loss": 1.4833, + "step": 3810 + }, + { + "epoch": 3.2060428031892574, + "grad_norm": 0.5655816197395325, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 3820 + }, + { + "epoch": 3.214435585396559, + "grad_norm": 0.7279291152954102, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 3830 + }, + { + "epoch": 3.2228283676038605, + "grad_norm": 0.490998238325119, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3840 + }, + { + "epoch": 3.2312211498111623, + "grad_norm": 0.6065797209739685, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 3850 + }, + { + "epoch": 3.239613932018464, + "grad_norm": 0.6024682521820068, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3860 + }, + { + "epoch": 3.248006714225766, + "grad_norm": 0.5571125745773315, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 3870 + }, + { + "epoch": 3.2563994964330676, + "grad_norm": 0.5662134289741516, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3880 + }, + { + "epoch": 3.2647922786403694, + "grad_norm": 0.5936661958694458, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 3890 + }, + { + "epoch": 3.273185060847671, + "grad_norm": 0.6739671230316162, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3900 + }, + { + "epoch": 3.281577843054973, + "grad_norm": 0.5579532384872437, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3910 + }, + { + "epoch": 3.2899706252622742, + "grad_norm": 0.6595954298973083, + "learning_rate": 0.0002, + "loss": 1.4788, + "step": 3920 + }, + { + "epoch": 3.298363407469576, + "grad_norm": 0.5712262988090515, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3930 + }, + { + "epoch": 3.306756189676878, + "grad_norm": 0.5601761341094971, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3940 + }, + { + "epoch": 3.3151489718841796, + "grad_norm": 0.5759967565536499, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3950 + }, + { + "epoch": 3.3235417540914813, + "grad_norm": 0.6543047428131104, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 3960 + }, + { + "epoch": 3.331934536298783, + "grad_norm": 0.6355253458023071, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 3970 + }, + { + "epoch": 3.340327318506085, + "grad_norm": 0.5671007633209229, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3980 + }, + { + "epoch": 3.3487201007133867, + "grad_norm": 0.6743636727333069, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 3990 + }, + { + "epoch": 3.3571128829206884, + "grad_norm": 0.500627338886261, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 4000 + }, + { + "epoch": 3.3655056651279898, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4010 + }, + { + "epoch": 3.3738984473352915, + "grad_norm": 0.5651408433914185, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 4020 + }, + { + "epoch": 3.3822912295425933, + "grad_norm": 0.6338897943496704, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 4030 + }, + { + "epoch": 3.390684011749895, + "grad_norm": 0.5781935453414917, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 4040 + }, + { + "epoch": 3.399076793957197, + "grad_norm": 0.55543053150177, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4050 + }, + { + "epoch": 3.4074695761644986, + "grad_norm": 0.6602614521980286, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4060 + }, + { + "epoch": 3.4158623583718004, + "grad_norm": 0.5514156222343445, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 4070 + }, + { + "epoch": 3.4242551405791017, + "grad_norm": 0.5760560035705566, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 4080 + }, + { + "epoch": 3.4326479227864035, + "grad_norm": 0.657503604888916, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4090 + }, + { + "epoch": 3.4410407049937053, + "grad_norm": 0.5746736526489258, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4100 + }, + { + "epoch": 3.449433487201007, + "grad_norm": 0.5988999009132385, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 4110 + }, + { + "epoch": 3.457826269408309, + "grad_norm": 0.7294586300849915, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4120 + }, + { + "epoch": 3.4662190516156106, + "grad_norm": 0.6391161680221558, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 4130 + }, + { + "epoch": 3.4746118338229124, + "grad_norm": 0.6416470408439636, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4140 + }, + { + "epoch": 3.483004616030214, + "grad_norm": 0.5710626244544983, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 4150 + }, + { + "epoch": 3.491397398237516, + "grad_norm": 0.5370054841041565, + "learning_rate": 0.0002, + "loss": 1.4661, + "step": 4160 + }, + { + "epoch": 3.4997901804448173, + "grad_norm": 0.5559558272361755, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 4170 + }, + { + "epoch": 3.508182962652119, + "grad_norm": 0.5426168441772461, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 4180 + }, + { + "epoch": 3.516575744859421, + "grad_norm": 0.5997438430786133, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 4190 + }, + { + "epoch": 3.5249685270667226, + "grad_norm": 0.5399143099784851, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4200 + }, + { + "epoch": 3.5333613092740244, + "grad_norm": 0.6341416239738464, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 4210 + }, + { + "epoch": 3.541754091481326, + "grad_norm": 0.632238507270813, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4220 + }, + { + "epoch": 3.550146873688628, + "grad_norm": 0.6356478333473206, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 4230 + }, + { + "epoch": 3.5585396558959292, + "grad_norm": 0.6379408240318298, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 4240 + }, + { + "epoch": 3.5669324381032315, + "grad_norm": 0.6265586018562317, + "learning_rate": 0.0002, + "loss": 1.5184, + "step": 4250 + }, + { + "epoch": 3.575325220310533, + "grad_norm": 0.5378820896148682, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 4260 + }, + { + "epoch": 3.5837180025178346, + "grad_norm": 0.6800801753997803, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4270 + }, + { + "epoch": 3.5921107847251363, + "grad_norm": 0.5653113126754761, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4280 + }, + { + "epoch": 3.600503566932438, + "grad_norm": 0.548647940158844, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 4290 + }, + { + "epoch": 3.60889634913974, + "grad_norm": 0.5729944705963135, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4300 + }, + { + "epoch": 3.6172891313470417, + "grad_norm": 0.6204999685287476, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4310 + }, + { + "epoch": 3.6256819135543434, + "grad_norm": 0.6275812983512878, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 4320 + }, + { + "epoch": 3.6340746957616448, + "grad_norm": 0.7261835336685181, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 4330 + }, + { + "epoch": 3.6424674779689465, + "grad_norm": 0.6048004627227783, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 4340 + }, + { + "epoch": 3.6508602601762483, + "grad_norm": 0.5879671573638916, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 4350 + }, + { + "epoch": 3.65925304238355, + "grad_norm": 0.6001018285751343, + "learning_rate": 0.0002, + "loss": 1.5477, + "step": 4360 + }, + { + "epoch": 3.667645824590852, + "grad_norm": 0.6468151211738586, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4370 + }, + { + "epoch": 3.6760386067981536, + "grad_norm": 0.6342051029205322, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4380 + }, + { + "epoch": 3.6844313890054554, + "grad_norm": 0.6078384518623352, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4390 + }, + { + "epoch": 3.692824171212757, + "grad_norm": 0.5555588006973267, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4400 + }, + { + "epoch": 3.701216953420059, + "grad_norm": 0.6089665293693542, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 4410 + }, + { + "epoch": 3.7096097356273603, + "grad_norm": 0.6225191950798035, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 4420 + }, + { + "epoch": 3.718002517834662, + "grad_norm": 0.5642715692520142, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 4430 + }, + { + "epoch": 3.726395300041964, + "grad_norm": 0.5703449845314026, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 4440 + }, + { + "epoch": 3.7347880822492656, + "grad_norm": 0.6029745936393738, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 4450 + }, + { + "epoch": 3.7431808644565674, + "grad_norm": 0.7089189887046814, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4460 + }, + { + "epoch": 3.751573646663869, + "grad_norm": 0.6230936050415039, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 4470 + }, + { + "epoch": 3.759966428871171, + "grad_norm": 0.5718494653701782, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4480 + }, + { + "epoch": 3.7683592110784723, + "grad_norm": 0.5404117703437805, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4490 + }, + { + "epoch": 3.7767519932857745, + "grad_norm": 0.5816529393196106, + "learning_rate": 0.0002, + "loss": 1.4707, + "step": 4500 + }, + { + "epoch": 3.785144775493076, + "grad_norm": 0.6314901113510132, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 4510 + }, + { + "epoch": 3.7935375577003776, + "grad_norm": 0.7639698386192322, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 4520 + }, + { + "epoch": 3.8019303399076794, + "grad_norm": 0.5727366209030151, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4530 + }, + { + "epoch": 3.810323122114981, + "grad_norm": 0.6467128396034241, + "learning_rate": 0.0002, + "loss": 1.5409, + "step": 4540 + }, + { + "epoch": 3.818715904322283, + "grad_norm": 0.6572837233543396, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 4550 + }, + { + "epoch": 3.8271086865295847, + "grad_norm": 0.5847418904304504, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4560 + }, + { + "epoch": 3.8355014687368865, + "grad_norm": 0.48820871114730835, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 4570 + }, + { + "epoch": 3.843894250944188, + "grad_norm": 1.2537429332733154, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 4580 + }, + { + "epoch": 3.8522870331514896, + "grad_norm": 0.6026989221572876, + "learning_rate": 0.0002, + "loss": 1.5522, + "step": 4590 + }, + { + "epoch": 3.8606798153587913, + "grad_norm": 0.5541417598724365, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 4600 + }, + { + "epoch": 3.869072597566093, + "grad_norm": 0.7668771147727966, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 4610 + }, + { + "epoch": 3.877465379773395, + "grad_norm": 0.6181227564811707, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4620 + }, + { + "epoch": 3.8858581619806967, + "grad_norm": 0.5842700004577637, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4630 + }, + { + "epoch": 3.8942509441879984, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 4640 + }, + { + "epoch": 3.9026437263952998, + "grad_norm": 0.6212735772132874, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4650 + }, + { + "epoch": 3.911036508602602, + "grad_norm": 0.6123346090316772, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 4660 + }, + { + "epoch": 3.9194292908099033, + "grad_norm": 0.518662691116333, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4670 + }, + { + "epoch": 3.927822073017205, + "grad_norm": 0.6963476538658142, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4680 + }, + { + "epoch": 3.936214855224507, + "grad_norm": 0.5192152261734009, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4690 + }, + { + "epoch": 3.9446076374318086, + "grad_norm": 0.5820888876914978, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 4700 + }, + { + "epoch": 3.9530004196391104, + "grad_norm": 0.6320387721061707, + "learning_rate": 0.0002, + "loss": 1.527, + "step": 4710 + }, + { + "epoch": 3.961393201846412, + "grad_norm": 0.6174548268318176, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4720 + }, + { + "epoch": 3.969785984053714, + "grad_norm": 0.6691966652870178, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4730 + }, + { + "epoch": 3.9781787662610153, + "grad_norm": 0.5972068309783936, + "learning_rate": 0.0002, + "loss": 1.4762, + "step": 4740 + }, + { + "epoch": 3.9865715484683175, + "grad_norm": 0.5759536027908325, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4750 + }, + { + "epoch": 3.994964330675619, + "grad_norm": 0.5886756777763367, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 4760 + }, + { + "epoch": 4.0, + "eval_loss": 1.8749940395355225, + "eval_runtime": 38.037, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 4766 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.20559808493781e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-4766/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..786c18b400dfbf5735099f21daf5330efe66f5f6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1b04410e3542ec68e8311807f7ce556a30411cc1d4a7e89ab0bb5537ff1755 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..178318cdfaed8c8cc80e8b7a631e290aad241b33 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb77fba62dfc2a5cf0af71316711cf835bdd641ac264c228d18197654766dce8 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4e0000d6dcb8ecb61f4422cfcdc2d4b8b874f8bd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8066332c84a9c7befe00c1559a3c5d2948fa8e661e088d61628294c250d32cdf +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..42c9051d3b0547abd2974e12868c639663cc185c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0f9620558d50cb26bee2c79bea15ffc2c6c6c08705686c3b42cd83fe94fdc50 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f24bfed038b27b03890783194ba2db612387b92a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/trainer_state.json @@ -0,0 +1,4238 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 4.9995803608896345, + "eval_steps": 10, + "global_step": 5957, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + }, + { + "epoch": 3.004616030214016, + "grad_norm": 0.4380604326725006, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 3580 + }, + { + "epoch": 3.0130088124213175, + "grad_norm": 0.5375564098358154, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 3590 + }, + { + "epoch": 3.0214015946286192, + "grad_norm": 0.50722736120224, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3600 + }, + { + "epoch": 3.029794376835921, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 3610 + }, + { + "epoch": 3.038187159043223, + "grad_norm": 0.520709753036499, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 3620 + }, + { + "epoch": 3.0465799412505246, + "grad_norm": 0.5429664850234985, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3630 + }, + { + "epoch": 3.0549727234578263, + "grad_norm": 0.5634943842887878, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 3640 + }, + { + "epoch": 3.063365505665128, + "grad_norm": 0.5042277574539185, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 3650 + }, + { + "epoch": 3.07175828787243, + "grad_norm": 0.5778711438179016, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 3660 + }, + { + "epoch": 3.080151070079731, + "grad_norm": 0.5504926443099976, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 3670 + }, + { + "epoch": 3.088543852287033, + "grad_norm": 0.5199463963508606, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3680 + }, + { + "epoch": 3.0969366344943348, + "grad_norm": 0.552334189414978, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 3690 + }, + { + "epoch": 3.1053294167016365, + "grad_norm": 0.5650873780250549, + "learning_rate": 0.0002, + "loss": 1.4638, + "step": 3700 + }, + { + "epoch": 3.1137221989089383, + "grad_norm": 0.6292349696159363, + "learning_rate": 0.0002, + "loss": 1.4945, + "step": 3710 + }, + { + "epoch": 3.12211498111624, + "grad_norm": 0.5523604154586792, + "learning_rate": 0.0002, + "loss": 1.4787, + "step": 3720 + }, + { + "epoch": 3.130507763323542, + "grad_norm": 0.6160100698471069, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3730 + }, + { + "epoch": 3.1389005455308436, + "grad_norm": 0.6091629266738892, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3740 + }, + { + "epoch": 3.1472933277381454, + "grad_norm": 0.5695531964302063, + "learning_rate": 0.0002, + "loss": 1.4659, + "step": 3750 + }, + { + "epoch": 3.1556861099454467, + "grad_norm": 0.569611132144928, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 3760 + }, + { + "epoch": 3.1640788921527485, + "grad_norm": 0.5761140584945679, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 3770 + }, + { + "epoch": 3.1724716743600503, + "grad_norm": 0.6855548620223999, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 3780 + }, + { + "epoch": 3.180864456567352, + "grad_norm": 0.5815101265907288, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 3790 + }, + { + "epoch": 3.189257238774654, + "grad_norm": 0.6179960370063782, + "learning_rate": 0.0002, + "loss": 1.5289, + "step": 3800 + }, + { + "epoch": 3.1976500209819556, + "grad_norm": 0.5418674349784851, + "learning_rate": 0.0002, + "loss": 1.4833, + "step": 3810 + }, + { + "epoch": 3.2060428031892574, + "grad_norm": 0.5655816197395325, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 3820 + }, + { + "epoch": 3.214435585396559, + "grad_norm": 0.7279291152954102, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 3830 + }, + { + "epoch": 3.2228283676038605, + "grad_norm": 0.490998238325119, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3840 + }, + { + "epoch": 3.2312211498111623, + "grad_norm": 0.6065797209739685, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 3850 + }, + { + "epoch": 3.239613932018464, + "grad_norm": 0.6024682521820068, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3860 + }, + { + "epoch": 3.248006714225766, + "grad_norm": 0.5571125745773315, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 3870 + }, + { + "epoch": 3.2563994964330676, + "grad_norm": 0.5662134289741516, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3880 + }, + { + "epoch": 3.2647922786403694, + "grad_norm": 0.5936661958694458, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 3890 + }, + { + "epoch": 3.273185060847671, + "grad_norm": 0.6739671230316162, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3900 + }, + { + "epoch": 3.281577843054973, + "grad_norm": 0.5579532384872437, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3910 + }, + { + "epoch": 3.2899706252622742, + "grad_norm": 0.6595954298973083, + "learning_rate": 0.0002, + "loss": 1.4788, + "step": 3920 + }, + { + "epoch": 3.298363407469576, + "grad_norm": 0.5712262988090515, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3930 + }, + { + "epoch": 3.306756189676878, + "grad_norm": 0.5601761341094971, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3940 + }, + { + "epoch": 3.3151489718841796, + "grad_norm": 0.5759967565536499, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3950 + }, + { + "epoch": 3.3235417540914813, + "grad_norm": 0.6543047428131104, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 3960 + }, + { + "epoch": 3.331934536298783, + "grad_norm": 0.6355253458023071, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 3970 + }, + { + "epoch": 3.340327318506085, + "grad_norm": 0.5671007633209229, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3980 + }, + { + "epoch": 3.3487201007133867, + "grad_norm": 0.6743636727333069, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 3990 + }, + { + "epoch": 3.3571128829206884, + "grad_norm": 0.500627338886261, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 4000 + }, + { + "epoch": 3.3655056651279898, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4010 + }, + { + "epoch": 3.3738984473352915, + "grad_norm": 0.5651408433914185, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 4020 + }, + { + "epoch": 3.3822912295425933, + "grad_norm": 0.6338897943496704, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 4030 + }, + { + "epoch": 3.390684011749895, + "grad_norm": 0.5781935453414917, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 4040 + }, + { + "epoch": 3.399076793957197, + "grad_norm": 0.55543053150177, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4050 + }, + { + "epoch": 3.4074695761644986, + "grad_norm": 0.6602614521980286, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4060 + }, + { + "epoch": 3.4158623583718004, + "grad_norm": 0.5514156222343445, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 4070 + }, + { + "epoch": 3.4242551405791017, + "grad_norm": 0.5760560035705566, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 4080 + }, + { + "epoch": 3.4326479227864035, + "grad_norm": 0.657503604888916, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4090 + }, + { + "epoch": 3.4410407049937053, + "grad_norm": 0.5746736526489258, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4100 + }, + { + "epoch": 3.449433487201007, + "grad_norm": 0.5988999009132385, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 4110 + }, + { + "epoch": 3.457826269408309, + "grad_norm": 0.7294586300849915, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4120 + }, + { + "epoch": 3.4662190516156106, + "grad_norm": 0.6391161680221558, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 4130 + }, + { + "epoch": 3.4746118338229124, + "grad_norm": 0.6416470408439636, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4140 + }, + { + "epoch": 3.483004616030214, + "grad_norm": 0.5710626244544983, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 4150 + }, + { + "epoch": 3.491397398237516, + "grad_norm": 0.5370054841041565, + "learning_rate": 0.0002, + "loss": 1.4661, + "step": 4160 + }, + { + "epoch": 3.4997901804448173, + "grad_norm": 0.5559558272361755, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 4170 + }, + { + "epoch": 3.508182962652119, + "grad_norm": 0.5426168441772461, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 4180 + }, + { + "epoch": 3.516575744859421, + "grad_norm": 0.5997438430786133, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 4190 + }, + { + "epoch": 3.5249685270667226, + "grad_norm": 0.5399143099784851, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4200 + }, + { + "epoch": 3.5333613092740244, + "grad_norm": 0.6341416239738464, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 4210 + }, + { + "epoch": 3.541754091481326, + "grad_norm": 0.632238507270813, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4220 + }, + { + "epoch": 3.550146873688628, + "grad_norm": 0.6356478333473206, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 4230 + }, + { + "epoch": 3.5585396558959292, + "grad_norm": 0.6379408240318298, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 4240 + }, + { + "epoch": 3.5669324381032315, + "grad_norm": 0.6265586018562317, + "learning_rate": 0.0002, + "loss": 1.5184, + "step": 4250 + }, + { + "epoch": 3.575325220310533, + "grad_norm": 0.5378820896148682, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 4260 + }, + { + "epoch": 3.5837180025178346, + "grad_norm": 0.6800801753997803, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4270 + }, + { + "epoch": 3.5921107847251363, + "grad_norm": 0.5653113126754761, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4280 + }, + { + "epoch": 3.600503566932438, + "grad_norm": 0.548647940158844, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 4290 + }, + { + "epoch": 3.60889634913974, + "grad_norm": 0.5729944705963135, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4300 + }, + { + "epoch": 3.6172891313470417, + "grad_norm": 0.6204999685287476, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4310 + }, + { + "epoch": 3.6256819135543434, + "grad_norm": 0.6275812983512878, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 4320 + }, + { + "epoch": 3.6340746957616448, + "grad_norm": 0.7261835336685181, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 4330 + }, + { + "epoch": 3.6424674779689465, + "grad_norm": 0.6048004627227783, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 4340 + }, + { + "epoch": 3.6508602601762483, + "grad_norm": 0.5879671573638916, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 4350 + }, + { + "epoch": 3.65925304238355, + "grad_norm": 0.6001018285751343, + "learning_rate": 0.0002, + "loss": 1.5477, + "step": 4360 + }, + { + "epoch": 3.667645824590852, + "grad_norm": 0.6468151211738586, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4370 + }, + { + "epoch": 3.6760386067981536, + "grad_norm": 0.6342051029205322, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4380 + }, + { + "epoch": 3.6844313890054554, + "grad_norm": 0.6078384518623352, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4390 + }, + { + "epoch": 3.692824171212757, + "grad_norm": 0.5555588006973267, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4400 + }, + { + "epoch": 3.701216953420059, + "grad_norm": 0.6089665293693542, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 4410 + }, + { + "epoch": 3.7096097356273603, + "grad_norm": 0.6225191950798035, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 4420 + }, + { + "epoch": 3.718002517834662, + "grad_norm": 0.5642715692520142, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 4430 + }, + { + "epoch": 3.726395300041964, + "grad_norm": 0.5703449845314026, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 4440 + }, + { + "epoch": 3.7347880822492656, + "grad_norm": 0.6029745936393738, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 4450 + }, + { + "epoch": 3.7431808644565674, + "grad_norm": 0.7089189887046814, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4460 + }, + { + "epoch": 3.751573646663869, + "grad_norm": 0.6230936050415039, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 4470 + }, + { + "epoch": 3.759966428871171, + "grad_norm": 0.5718494653701782, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4480 + }, + { + "epoch": 3.7683592110784723, + "grad_norm": 0.5404117703437805, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4490 + }, + { + "epoch": 3.7767519932857745, + "grad_norm": 0.5816529393196106, + "learning_rate": 0.0002, + "loss": 1.4707, + "step": 4500 + }, + { + "epoch": 3.785144775493076, + "grad_norm": 0.6314901113510132, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 4510 + }, + { + "epoch": 3.7935375577003776, + "grad_norm": 0.7639698386192322, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 4520 + }, + { + "epoch": 3.8019303399076794, + "grad_norm": 0.5727366209030151, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4530 + }, + { + "epoch": 3.810323122114981, + "grad_norm": 0.6467128396034241, + "learning_rate": 0.0002, + "loss": 1.5409, + "step": 4540 + }, + { + "epoch": 3.818715904322283, + "grad_norm": 0.6572837233543396, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 4550 + }, + { + "epoch": 3.8271086865295847, + "grad_norm": 0.5847418904304504, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4560 + }, + { + "epoch": 3.8355014687368865, + "grad_norm": 0.48820871114730835, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 4570 + }, + { + "epoch": 3.843894250944188, + "grad_norm": 1.2537429332733154, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 4580 + }, + { + "epoch": 3.8522870331514896, + "grad_norm": 0.6026989221572876, + "learning_rate": 0.0002, + "loss": 1.5522, + "step": 4590 + }, + { + "epoch": 3.8606798153587913, + "grad_norm": 0.5541417598724365, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 4600 + }, + { + "epoch": 3.869072597566093, + "grad_norm": 0.7668771147727966, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 4610 + }, + { + "epoch": 3.877465379773395, + "grad_norm": 0.6181227564811707, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4620 + }, + { + "epoch": 3.8858581619806967, + "grad_norm": 0.5842700004577637, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4630 + }, + { + "epoch": 3.8942509441879984, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 4640 + }, + { + "epoch": 3.9026437263952998, + "grad_norm": 0.6212735772132874, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4650 + }, + { + "epoch": 3.911036508602602, + "grad_norm": 0.6123346090316772, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 4660 + }, + { + "epoch": 3.9194292908099033, + "grad_norm": 0.518662691116333, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4670 + }, + { + "epoch": 3.927822073017205, + "grad_norm": 0.6963476538658142, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4680 + }, + { + "epoch": 3.936214855224507, + "grad_norm": 0.5192152261734009, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4690 + }, + { + "epoch": 3.9446076374318086, + "grad_norm": 0.5820888876914978, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 4700 + }, + { + "epoch": 3.9530004196391104, + "grad_norm": 0.6320387721061707, + "learning_rate": 0.0002, + "loss": 1.527, + "step": 4710 + }, + { + "epoch": 3.961393201846412, + "grad_norm": 0.6174548268318176, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4720 + }, + { + "epoch": 3.969785984053714, + "grad_norm": 0.6691966652870178, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4730 + }, + { + "epoch": 3.9781787662610153, + "grad_norm": 0.5972068309783936, + "learning_rate": 0.0002, + "loss": 1.4762, + "step": 4740 + }, + { + "epoch": 3.9865715484683175, + "grad_norm": 0.5759536027908325, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4750 + }, + { + "epoch": 3.994964330675619, + "grad_norm": 0.5886756777763367, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 4760 + }, + { + "epoch": 4.0, + "eval_loss": 1.8749940395355225, + "eval_runtime": 38.037, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 4766 + }, + { + "epoch": 4.003357112882921, + "grad_norm": 0.5915011167526245, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 4770 + }, + { + "epoch": 4.011749895090222, + "grad_norm": 0.8565000891685486, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 4780 + }, + { + "epoch": 4.020142677297524, + "grad_norm": 0.7753950953483582, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 4790 + }, + { + "epoch": 4.028535459504826, + "grad_norm": 0.6837254166603088, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 4800 + }, + { + "epoch": 4.036928241712127, + "grad_norm": 0.8374526500701904, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 4810 + }, + { + "epoch": 4.0453210239194295, + "grad_norm": 0.8717963099479675, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 4820 + }, + { + "epoch": 4.053713806126731, + "grad_norm": 0.7002043724060059, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4830 + }, + { + "epoch": 4.062106588334033, + "grad_norm": 1.0319572687149048, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 4840 + }, + { + "epoch": 4.070499370541334, + "grad_norm": 0.6746882200241089, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 4850 + }, + { + "epoch": 4.078892152748637, + "grad_norm": 0.8187578320503235, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 4860 + }, + { + "epoch": 4.087284934955938, + "grad_norm": 0.7888399362564087, + "learning_rate": 0.0002, + "loss": 1.368, + "step": 4870 + }, + { + "epoch": 4.095677717163239, + "grad_norm": 0.7149351239204407, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4880 + }, + { + "epoch": 4.1040704993705415, + "grad_norm": 0.9067983031272888, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 4890 + }, + { + "epoch": 4.112463281577843, + "grad_norm": 0.771186351776123, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 4900 + }, + { + "epoch": 4.120856063785145, + "grad_norm": 0.7756485342979431, + "learning_rate": 0.0002, + "loss": 1.2722, + "step": 4910 + }, + { + "epoch": 4.129248845992446, + "grad_norm": 0.7149116396903992, + "learning_rate": 0.0002, + "loss": 1.4138, + "step": 4920 + }, + { + "epoch": 4.137641628199749, + "grad_norm": 0.700442910194397, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4930 + }, + { + "epoch": 4.14603441040705, + "grad_norm": 0.8439189195632935, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4940 + }, + { + "epoch": 4.154427192614351, + "grad_norm": 0.6570779085159302, + "learning_rate": 0.0002, + "loss": 1.3511, + "step": 4950 + }, + { + "epoch": 4.1628199748216534, + "grad_norm": 0.886482298374176, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 4960 + }, + { + "epoch": 4.171212757028955, + "grad_norm": 0.7220938801765442, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 4970 + }, + { + "epoch": 4.179605539236257, + "grad_norm": 0.7185905575752258, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 4980 + }, + { + "epoch": 4.187998321443558, + "grad_norm": 0.7566333413124084, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 4990 + }, + { + "epoch": 4.1963911036508605, + "grad_norm": 0.6960445642471313, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 5000 + }, + { + "epoch": 4.204783885858162, + "grad_norm": 0.7727336883544922, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5010 + }, + { + "epoch": 4.213176668065464, + "grad_norm": 0.8038365244865417, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 5020 + }, + { + "epoch": 4.221569450272765, + "grad_norm": 0.7587628364562988, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5030 + }, + { + "epoch": 4.229962232480067, + "grad_norm": 0.928032398223877, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 5040 + }, + { + "epoch": 4.238355014687369, + "grad_norm": 0.7168642282485962, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5050 + }, + { + "epoch": 4.24674779689467, + "grad_norm": 0.7981422543525696, + "learning_rate": 0.0002, + "loss": 1.3767, + "step": 5060 + }, + { + "epoch": 4.2551405791019725, + "grad_norm": 0.6951150894165039, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5070 + }, + { + "epoch": 4.263533361309274, + "grad_norm": 0.7337371706962585, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 5080 + }, + { + "epoch": 4.271926143516576, + "grad_norm": 0.8367464542388916, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5090 + }, + { + "epoch": 4.280318925723877, + "grad_norm": 0.6744083166122437, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 5100 + }, + { + "epoch": 4.28871170793118, + "grad_norm": 0.9072301387786865, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 5110 + }, + { + "epoch": 4.297104490138481, + "grad_norm": 0.7703930735588074, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 5120 + }, + { + "epoch": 4.305497272345782, + "grad_norm": 0.6734083294868469, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5130 + }, + { + "epoch": 4.3138900545530845, + "grad_norm": 0.7835540175437927, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 5140 + }, + { + "epoch": 4.322282836760386, + "grad_norm": 1.0822200775146484, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 5150 + }, + { + "epoch": 4.330675618967688, + "grad_norm": 0.8432536721229553, + "learning_rate": 0.0002, + "loss": 1.4167, + "step": 5160 + }, + { + "epoch": 4.339068401174989, + "grad_norm": 0.6739283800125122, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 5170 + }, + { + "epoch": 4.347461183382292, + "grad_norm": 0.7395278811454773, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 5180 + }, + { + "epoch": 4.355853965589593, + "grad_norm": 0.7638891339302063, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 5190 + }, + { + "epoch": 4.364246747796894, + "grad_norm": 1.1222662925720215, + "learning_rate": 0.0002, + "loss": 1.34, + "step": 5200 + }, + { + "epoch": 4.3726395300041965, + "grad_norm": 0.9102525115013123, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5210 + }, + { + "epoch": 4.381032312211498, + "grad_norm": 0.7181593775749207, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 5220 + }, + { + "epoch": 4.3894250944188, + "grad_norm": 0.7813979387283325, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 5230 + }, + { + "epoch": 4.397817876626101, + "grad_norm": 0.8906185626983643, + "learning_rate": 0.0002, + "loss": 1.423, + "step": 5240 + }, + { + "epoch": 4.406210658833404, + "grad_norm": 0.7456443309783936, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5250 + }, + { + "epoch": 4.414603441040705, + "grad_norm": 0.8752070069313049, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 5260 + }, + { + "epoch": 4.422996223248007, + "grad_norm": 0.9560954570770264, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 5270 + }, + { + "epoch": 4.4313890054553084, + "grad_norm": 0.7227762341499329, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 5280 + }, + { + "epoch": 4.43978178766261, + "grad_norm": 0.8141599893569946, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 5290 + }, + { + "epoch": 4.448174569869912, + "grad_norm": 0.928382158279419, + "learning_rate": 0.0002, + "loss": 1.381, + "step": 5300 + }, + { + "epoch": 4.456567352077213, + "grad_norm": 0.7719997763633728, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5310 + }, + { + "epoch": 4.4649601342845155, + "grad_norm": 0.8081879615783691, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 5320 + }, + { + "epoch": 4.473352916491817, + "grad_norm": 0.7903412580490112, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 5330 + }, + { + "epoch": 4.481745698699119, + "grad_norm": 0.7751287221908569, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5340 + }, + { + "epoch": 4.49013848090642, + "grad_norm": 0.8287544250488281, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5350 + }, + { + "epoch": 4.498531263113723, + "grad_norm": 0.7431012392044067, + "learning_rate": 0.0002, + "loss": 1.3841, + "step": 5360 + }, + { + "epoch": 4.506924045321024, + "grad_norm": 0.8648661971092224, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5370 + }, + { + "epoch": 4.515316827528325, + "grad_norm": 0.9314997792243958, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5380 + }, + { + "epoch": 4.5237096097356275, + "grad_norm": 0.7530864477157593, + "learning_rate": 0.0002, + "loss": 1.354, + "step": 5390 + }, + { + "epoch": 4.532102391942929, + "grad_norm": 0.8739821910858154, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 5400 + }, + { + "epoch": 4.540495174150231, + "grad_norm": 0.8090344667434692, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 4.548887956357532, + "grad_norm": 0.7530879974365234, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 5420 + }, + { + "epoch": 4.557280738564835, + "grad_norm": 0.8787251114845276, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 5430 + }, + { + "epoch": 4.565673520772136, + "grad_norm": 0.813961923122406, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 5440 + }, + { + "epoch": 4.574066302979437, + "grad_norm": 0.7778232097625732, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 5450 + }, + { + "epoch": 4.5824590851867395, + "grad_norm": 0.7323020696640015, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5460 + }, + { + "epoch": 4.590851867394041, + "grad_norm": 0.7826765179634094, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5470 + }, + { + "epoch": 4.599244649601343, + "grad_norm": 0.7245969772338867, + "learning_rate": 0.0002, + "loss": 1.4068, + "step": 5480 + }, + { + "epoch": 4.607637431808644, + "grad_norm": 0.7697308659553528, + "learning_rate": 0.0002, + "loss": 1.4276, + "step": 5490 + }, + { + "epoch": 4.616030214015947, + "grad_norm": 0.8053571581840515, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5500 + }, + { + "epoch": 4.624422996223248, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 5510 + }, + { + "epoch": 4.632815778430549, + "grad_norm": 0.7398585677146912, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 5520 + }, + { + "epoch": 4.6412085606378515, + "grad_norm": 0.7896319031715393, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 5530 + }, + { + "epoch": 4.649601342845153, + "grad_norm": 0.8290980458259583, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5540 + }, + { + "epoch": 4.657994125052455, + "grad_norm": 0.8232647776603699, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 5550 + }, + { + "epoch": 4.666386907259756, + "grad_norm": 0.9154987335205078, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5560 + }, + { + "epoch": 4.674779689467059, + "grad_norm": 0.8400886654853821, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 5570 + }, + { + "epoch": 4.68317247167436, + "grad_norm": 0.7312718629837036, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 5580 + }, + { + "epoch": 4.691565253881662, + "grad_norm": 0.8043803572654724, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5590 + }, + { + "epoch": 4.6999580360889635, + "grad_norm": 0.7966225147247314, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5600 + }, + { + "epoch": 4.708350818296266, + "grad_norm": 0.881574809551239, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 5610 + }, + { + "epoch": 4.716743600503567, + "grad_norm": 0.7252084016799927, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5620 + }, + { + "epoch": 4.725136382710868, + "grad_norm": 0.7726518511772156, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 5630 + }, + { + "epoch": 4.7335291649181706, + "grad_norm": 0.7306379079818726, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5640 + }, + { + "epoch": 4.741921947125472, + "grad_norm": 0.8029969334602356, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 5650 + }, + { + "epoch": 4.750314729332774, + "grad_norm": 0.9103893637657166, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 5660 + }, + { + "epoch": 4.758707511540075, + "grad_norm": 0.8783416748046875, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5670 + }, + { + "epoch": 4.767100293747378, + "grad_norm": 0.6807119846343994, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 5680 + }, + { + "epoch": 4.775493075954679, + "grad_norm": 0.7103772759437561, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 5690 + }, + { + "epoch": 4.78388585816198, + "grad_norm": 0.8472093343734741, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 5700 + }, + { + "epoch": 4.7922786403692825, + "grad_norm": 0.851847231388092, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 5710 + }, + { + "epoch": 4.800671422576584, + "grad_norm": 0.9084636569023132, + "learning_rate": 0.0002, + "loss": 1.3965, + "step": 5720 + }, + { + "epoch": 4.809064204783886, + "grad_norm": 0.7628585696220398, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 5730 + }, + { + "epoch": 4.817456986991187, + "grad_norm": 0.775580883026123, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 5740 + }, + { + "epoch": 4.82584976919849, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 1.4573, + "step": 5750 + }, + { + "epoch": 4.834242551405791, + "grad_norm": 0.7021728754043579, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 5760 + }, + { + "epoch": 4.842635333613092, + "grad_norm": 0.7810541391372681, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5770 + }, + { + "epoch": 4.8510281158203945, + "grad_norm": 0.7290041446685791, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5780 + }, + { + "epoch": 4.859420898027696, + "grad_norm": 0.9059709906578064, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5790 + }, + { + "epoch": 4.867813680234998, + "grad_norm": 0.8338062167167664, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 5800 + }, + { + "epoch": 4.876206462442299, + "grad_norm": 0.830926775932312, + "learning_rate": 0.0002, + "loss": 1.395, + "step": 5810 + }, + { + "epoch": 4.884599244649602, + "grad_norm": 0.7818633317947388, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5820 + }, + { + "epoch": 4.892992026856903, + "grad_norm": 0.8143376708030701, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5830 + }, + { + "epoch": 4.901384809064205, + "grad_norm": 0.7754496335983276, + "learning_rate": 0.0002, + "loss": 1.3583, + "step": 5840 + }, + { + "epoch": 4.9097775912715065, + "grad_norm": 0.7154468297958374, + "learning_rate": 0.0002, + "loss": 1.4036, + "step": 5850 + }, + { + "epoch": 4.918170373478809, + "grad_norm": 0.6829783916473389, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 5860 + }, + { + "epoch": 4.92656315568611, + "grad_norm": 0.784919261932373, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5870 + }, + { + "epoch": 4.934955937893411, + "grad_norm": 0.8168354034423828, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5880 + }, + { + "epoch": 4.943348720100714, + "grad_norm": 0.7356618642807007, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 5890 + }, + { + "epoch": 4.951741502308015, + "grad_norm": 0.7399224042892456, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 5900 + }, + { + "epoch": 4.960134284515317, + "grad_norm": 0.7430436015129089, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 5910 + }, + { + "epoch": 4.9685270667226185, + "grad_norm": 0.7587705850601196, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 5920 + }, + { + "epoch": 4.976919848929921, + "grad_norm": 0.9103638529777527, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5930 + }, + { + "epoch": 4.985312631137222, + "grad_norm": 0.7357394695281982, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5940 + }, + { + "epoch": 4.993705413344523, + "grad_norm": 0.7371547222137451, + "learning_rate": 0.0002, + "loss": 1.3988, + "step": 5950 + }, + { + "epoch": 4.9995803608896345, + "eval_loss": 1.9367210865020752, + "eval_runtime": 37.9833, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 5957 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.7569976061722624e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-5957/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba2cd6e29bdbd4a71c0213559e131ebdee5649f3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb18a33c7ba6533933d614db599d62ba34dcb6575541e7cf5098afa52a9c581 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5562f558266a0dff43bb8932b36624a0b56935ce --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8a47099fbabc8a2ff40717917f3b536f9a0af0e47171478b9bc9d0d45f8d04 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b78a52d509992043808a313e8e368b90a575432d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117cdc15bd8687126a959b3b51dd98ff496d8b5212852cf0f05649b0676f9f55 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d6dd3d0f2538d4118af6f983c262e687f7b283 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a2cce07afc43399e52bbc63584aee9af86aba03c5af4593b0a2c4c34260a849 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e920221bea7ad1101cefe63446fc4bbd37b4f93 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/trainer_state.json @@ -0,0 +1,5079 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 7149, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + }, + { + "epoch": 3.004616030214016, + "grad_norm": 0.4380604326725006, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 3580 + }, + { + "epoch": 3.0130088124213175, + "grad_norm": 0.5375564098358154, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 3590 + }, + { + "epoch": 3.0214015946286192, + "grad_norm": 0.50722736120224, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3600 + }, + { + "epoch": 3.029794376835921, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 3610 + }, + { + "epoch": 3.038187159043223, + "grad_norm": 0.520709753036499, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 3620 + }, + { + "epoch": 3.0465799412505246, + "grad_norm": 0.5429664850234985, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3630 + }, + { + "epoch": 3.0549727234578263, + "grad_norm": 0.5634943842887878, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 3640 + }, + { + "epoch": 3.063365505665128, + "grad_norm": 0.5042277574539185, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 3650 + }, + { + "epoch": 3.07175828787243, + "grad_norm": 0.5778711438179016, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 3660 + }, + { + "epoch": 3.080151070079731, + "grad_norm": 0.5504926443099976, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 3670 + }, + { + "epoch": 3.088543852287033, + "grad_norm": 0.5199463963508606, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3680 + }, + { + "epoch": 3.0969366344943348, + "grad_norm": 0.552334189414978, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 3690 + }, + { + "epoch": 3.1053294167016365, + "grad_norm": 0.5650873780250549, + "learning_rate": 0.0002, + "loss": 1.4638, + "step": 3700 + }, + { + "epoch": 3.1137221989089383, + "grad_norm": 0.6292349696159363, + "learning_rate": 0.0002, + "loss": 1.4945, + "step": 3710 + }, + { + "epoch": 3.12211498111624, + "grad_norm": 0.5523604154586792, + "learning_rate": 0.0002, + "loss": 1.4787, + "step": 3720 + }, + { + "epoch": 3.130507763323542, + "grad_norm": 0.6160100698471069, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3730 + }, + { + "epoch": 3.1389005455308436, + "grad_norm": 0.6091629266738892, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3740 + }, + { + "epoch": 3.1472933277381454, + "grad_norm": 0.5695531964302063, + "learning_rate": 0.0002, + "loss": 1.4659, + "step": 3750 + }, + { + "epoch": 3.1556861099454467, + "grad_norm": 0.569611132144928, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 3760 + }, + { + "epoch": 3.1640788921527485, + "grad_norm": 0.5761140584945679, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 3770 + }, + { + "epoch": 3.1724716743600503, + "grad_norm": 0.6855548620223999, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 3780 + }, + { + "epoch": 3.180864456567352, + "grad_norm": 0.5815101265907288, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 3790 + }, + { + "epoch": 3.189257238774654, + "grad_norm": 0.6179960370063782, + "learning_rate": 0.0002, + "loss": 1.5289, + "step": 3800 + }, + { + "epoch": 3.1976500209819556, + "grad_norm": 0.5418674349784851, + "learning_rate": 0.0002, + "loss": 1.4833, + "step": 3810 + }, + { + "epoch": 3.2060428031892574, + "grad_norm": 0.5655816197395325, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 3820 + }, + { + "epoch": 3.214435585396559, + "grad_norm": 0.7279291152954102, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 3830 + }, + { + "epoch": 3.2228283676038605, + "grad_norm": 0.490998238325119, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3840 + }, + { + "epoch": 3.2312211498111623, + "grad_norm": 0.6065797209739685, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 3850 + }, + { + "epoch": 3.239613932018464, + "grad_norm": 0.6024682521820068, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3860 + }, + { + "epoch": 3.248006714225766, + "grad_norm": 0.5571125745773315, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 3870 + }, + { + "epoch": 3.2563994964330676, + "grad_norm": 0.5662134289741516, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3880 + }, + { + "epoch": 3.2647922786403694, + "grad_norm": 0.5936661958694458, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 3890 + }, + { + "epoch": 3.273185060847671, + "grad_norm": 0.6739671230316162, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3900 + }, + { + "epoch": 3.281577843054973, + "grad_norm": 0.5579532384872437, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3910 + }, + { + "epoch": 3.2899706252622742, + "grad_norm": 0.6595954298973083, + "learning_rate": 0.0002, + "loss": 1.4788, + "step": 3920 + }, + { + "epoch": 3.298363407469576, + "grad_norm": 0.5712262988090515, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3930 + }, + { + "epoch": 3.306756189676878, + "grad_norm": 0.5601761341094971, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3940 + }, + { + "epoch": 3.3151489718841796, + "grad_norm": 0.5759967565536499, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3950 + }, + { + "epoch": 3.3235417540914813, + "grad_norm": 0.6543047428131104, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 3960 + }, + { + "epoch": 3.331934536298783, + "grad_norm": 0.6355253458023071, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 3970 + }, + { + "epoch": 3.340327318506085, + "grad_norm": 0.5671007633209229, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3980 + }, + { + "epoch": 3.3487201007133867, + "grad_norm": 0.6743636727333069, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 3990 + }, + { + "epoch": 3.3571128829206884, + "grad_norm": 0.500627338886261, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 4000 + }, + { + "epoch": 3.3655056651279898, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4010 + }, + { + "epoch": 3.3738984473352915, + "grad_norm": 0.5651408433914185, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 4020 + }, + { + "epoch": 3.3822912295425933, + "grad_norm": 0.6338897943496704, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 4030 + }, + { + "epoch": 3.390684011749895, + "grad_norm": 0.5781935453414917, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 4040 + }, + { + "epoch": 3.399076793957197, + "grad_norm": 0.55543053150177, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4050 + }, + { + "epoch": 3.4074695761644986, + "grad_norm": 0.6602614521980286, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4060 + }, + { + "epoch": 3.4158623583718004, + "grad_norm": 0.5514156222343445, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 4070 + }, + { + "epoch": 3.4242551405791017, + "grad_norm": 0.5760560035705566, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 4080 + }, + { + "epoch": 3.4326479227864035, + "grad_norm": 0.657503604888916, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4090 + }, + { + "epoch": 3.4410407049937053, + "grad_norm": 0.5746736526489258, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4100 + }, + { + "epoch": 3.449433487201007, + "grad_norm": 0.5988999009132385, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 4110 + }, + { + "epoch": 3.457826269408309, + "grad_norm": 0.7294586300849915, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4120 + }, + { + "epoch": 3.4662190516156106, + "grad_norm": 0.6391161680221558, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 4130 + }, + { + "epoch": 3.4746118338229124, + "grad_norm": 0.6416470408439636, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4140 + }, + { + "epoch": 3.483004616030214, + "grad_norm": 0.5710626244544983, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 4150 + }, + { + "epoch": 3.491397398237516, + "grad_norm": 0.5370054841041565, + "learning_rate": 0.0002, + "loss": 1.4661, + "step": 4160 + }, + { + "epoch": 3.4997901804448173, + "grad_norm": 0.5559558272361755, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 4170 + }, + { + "epoch": 3.508182962652119, + "grad_norm": 0.5426168441772461, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 4180 + }, + { + "epoch": 3.516575744859421, + "grad_norm": 0.5997438430786133, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 4190 + }, + { + "epoch": 3.5249685270667226, + "grad_norm": 0.5399143099784851, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4200 + }, + { + "epoch": 3.5333613092740244, + "grad_norm": 0.6341416239738464, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 4210 + }, + { + "epoch": 3.541754091481326, + "grad_norm": 0.632238507270813, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4220 + }, + { + "epoch": 3.550146873688628, + "grad_norm": 0.6356478333473206, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 4230 + }, + { + "epoch": 3.5585396558959292, + "grad_norm": 0.6379408240318298, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 4240 + }, + { + "epoch": 3.5669324381032315, + "grad_norm": 0.6265586018562317, + "learning_rate": 0.0002, + "loss": 1.5184, + "step": 4250 + }, + { + "epoch": 3.575325220310533, + "grad_norm": 0.5378820896148682, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 4260 + }, + { + "epoch": 3.5837180025178346, + "grad_norm": 0.6800801753997803, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4270 + }, + { + "epoch": 3.5921107847251363, + "grad_norm": 0.5653113126754761, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4280 + }, + { + "epoch": 3.600503566932438, + "grad_norm": 0.548647940158844, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 4290 + }, + { + "epoch": 3.60889634913974, + "grad_norm": 0.5729944705963135, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4300 + }, + { + "epoch": 3.6172891313470417, + "grad_norm": 0.6204999685287476, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4310 + }, + { + "epoch": 3.6256819135543434, + "grad_norm": 0.6275812983512878, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 4320 + }, + { + "epoch": 3.6340746957616448, + "grad_norm": 0.7261835336685181, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 4330 + }, + { + "epoch": 3.6424674779689465, + "grad_norm": 0.6048004627227783, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 4340 + }, + { + "epoch": 3.6508602601762483, + "grad_norm": 0.5879671573638916, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 4350 + }, + { + "epoch": 3.65925304238355, + "grad_norm": 0.6001018285751343, + "learning_rate": 0.0002, + "loss": 1.5477, + "step": 4360 + }, + { + "epoch": 3.667645824590852, + "grad_norm": 0.6468151211738586, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4370 + }, + { + "epoch": 3.6760386067981536, + "grad_norm": 0.6342051029205322, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4380 + }, + { + "epoch": 3.6844313890054554, + "grad_norm": 0.6078384518623352, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4390 + }, + { + "epoch": 3.692824171212757, + "grad_norm": 0.5555588006973267, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4400 + }, + { + "epoch": 3.701216953420059, + "grad_norm": 0.6089665293693542, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 4410 + }, + { + "epoch": 3.7096097356273603, + "grad_norm": 0.6225191950798035, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 4420 + }, + { + "epoch": 3.718002517834662, + "grad_norm": 0.5642715692520142, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 4430 + }, + { + "epoch": 3.726395300041964, + "grad_norm": 0.5703449845314026, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 4440 + }, + { + "epoch": 3.7347880822492656, + "grad_norm": 0.6029745936393738, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 4450 + }, + { + "epoch": 3.7431808644565674, + "grad_norm": 0.7089189887046814, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4460 + }, + { + "epoch": 3.751573646663869, + "grad_norm": 0.6230936050415039, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 4470 + }, + { + "epoch": 3.759966428871171, + "grad_norm": 0.5718494653701782, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4480 + }, + { + "epoch": 3.7683592110784723, + "grad_norm": 0.5404117703437805, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4490 + }, + { + "epoch": 3.7767519932857745, + "grad_norm": 0.5816529393196106, + "learning_rate": 0.0002, + "loss": 1.4707, + "step": 4500 + }, + { + "epoch": 3.785144775493076, + "grad_norm": 0.6314901113510132, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 4510 + }, + { + "epoch": 3.7935375577003776, + "grad_norm": 0.7639698386192322, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 4520 + }, + { + "epoch": 3.8019303399076794, + "grad_norm": 0.5727366209030151, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4530 + }, + { + "epoch": 3.810323122114981, + "grad_norm": 0.6467128396034241, + "learning_rate": 0.0002, + "loss": 1.5409, + "step": 4540 + }, + { + "epoch": 3.818715904322283, + "grad_norm": 0.6572837233543396, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 4550 + }, + { + "epoch": 3.8271086865295847, + "grad_norm": 0.5847418904304504, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4560 + }, + { + "epoch": 3.8355014687368865, + "grad_norm": 0.48820871114730835, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 4570 + }, + { + "epoch": 3.843894250944188, + "grad_norm": 1.2537429332733154, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 4580 + }, + { + "epoch": 3.8522870331514896, + "grad_norm": 0.6026989221572876, + "learning_rate": 0.0002, + "loss": 1.5522, + "step": 4590 + }, + { + "epoch": 3.8606798153587913, + "grad_norm": 0.5541417598724365, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 4600 + }, + { + "epoch": 3.869072597566093, + "grad_norm": 0.7668771147727966, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 4610 + }, + { + "epoch": 3.877465379773395, + "grad_norm": 0.6181227564811707, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4620 + }, + { + "epoch": 3.8858581619806967, + "grad_norm": 0.5842700004577637, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4630 + }, + { + "epoch": 3.8942509441879984, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 4640 + }, + { + "epoch": 3.9026437263952998, + "grad_norm": 0.6212735772132874, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4650 + }, + { + "epoch": 3.911036508602602, + "grad_norm": 0.6123346090316772, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 4660 + }, + { + "epoch": 3.9194292908099033, + "grad_norm": 0.518662691116333, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4670 + }, + { + "epoch": 3.927822073017205, + "grad_norm": 0.6963476538658142, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4680 + }, + { + "epoch": 3.936214855224507, + "grad_norm": 0.5192152261734009, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4690 + }, + { + "epoch": 3.9446076374318086, + "grad_norm": 0.5820888876914978, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 4700 + }, + { + "epoch": 3.9530004196391104, + "grad_norm": 0.6320387721061707, + "learning_rate": 0.0002, + "loss": 1.527, + "step": 4710 + }, + { + "epoch": 3.961393201846412, + "grad_norm": 0.6174548268318176, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4720 + }, + { + "epoch": 3.969785984053714, + "grad_norm": 0.6691966652870178, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4730 + }, + { + "epoch": 3.9781787662610153, + "grad_norm": 0.5972068309783936, + "learning_rate": 0.0002, + "loss": 1.4762, + "step": 4740 + }, + { + "epoch": 3.9865715484683175, + "grad_norm": 0.5759536027908325, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4750 + }, + { + "epoch": 3.994964330675619, + "grad_norm": 0.5886756777763367, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 4760 + }, + { + "epoch": 4.0, + "eval_loss": 1.8749940395355225, + "eval_runtime": 38.037, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 4766 + }, + { + "epoch": 4.003357112882921, + "grad_norm": 0.5915011167526245, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 4770 + }, + { + "epoch": 4.011749895090222, + "grad_norm": 0.8565000891685486, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 4780 + }, + { + "epoch": 4.020142677297524, + "grad_norm": 0.7753950953483582, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 4790 + }, + { + "epoch": 4.028535459504826, + "grad_norm": 0.6837254166603088, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 4800 + }, + { + "epoch": 4.036928241712127, + "grad_norm": 0.8374526500701904, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 4810 + }, + { + "epoch": 4.0453210239194295, + "grad_norm": 0.8717963099479675, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 4820 + }, + { + "epoch": 4.053713806126731, + "grad_norm": 0.7002043724060059, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4830 + }, + { + "epoch": 4.062106588334033, + "grad_norm": 1.0319572687149048, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 4840 + }, + { + "epoch": 4.070499370541334, + "grad_norm": 0.6746882200241089, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 4850 + }, + { + "epoch": 4.078892152748637, + "grad_norm": 0.8187578320503235, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 4860 + }, + { + "epoch": 4.087284934955938, + "grad_norm": 0.7888399362564087, + "learning_rate": 0.0002, + "loss": 1.368, + "step": 4870 + }, + { + "epoch": 4.095677717163239, + "grad_norm": 0.7149351239204407, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4880 + }, + { + "epoch": 4.1040704993705415, + "grad_norm": 0.9067983031272888, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 4890 + }, + { + "epoch": 4.112463281577843, + "grad_norm": 0.771186351776123, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 4900 + }, + { + "epoch": 4.120856063785145, + "grad_norm": 0.7756485342979431, + "learning_rate": 0.0002, + "loss": 1.2722, + "step": 4910 + }, + { + "epoch": 4.129248845992446, + "grad_norm": 0.7149116396903992, + "learning_rate": 0.0002, + "loss": 1.4138, + "step": 4920 + }, + { + "epoch": 4.137641628199749, + "grad_norm": 0.700442910194397, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4930 + }, + { + "epoch": 4.14603441040705, + "grad_norm": 0.8439189195632935, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4940 + }, + { + "epoch": 4.154427192614351, + "grad_norm": 0.6570779085159302, + "learning_rate": 0.0002, + "loss": 1.3511, + "step": 4950 + }, + { + "epoch": 4.1628199748216534, + "grad_norm": 0.886482298374176, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 4960 + }, + { + "epoch": 4.171212757028955, + "grad_norm": 0.7220938801765442, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 4970 + }, + { + "epoch": 4.179605539236257, + "grad_norm": 0.7185905575752258, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 4980 + }, + { + "epoch": 4.187998321443558, + "grad_norm": 0.7566333413124084, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 4990 + }, + { + "epoch": 4.1963911036508605, + "grad_norm": 0.6960445642471313, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 5000 + }, + { + "epoch": 4.204783885858162, + "grad_norm": 0.7727336883544922, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5010 + }, + { + "epoch": 4.213176668065464, + "grad_norm": 0.8038365244865417, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 5020 + }, + { + "epoch": 4.221569450272765, + "grad_norm": 0.7587628364562988, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5030 + }, + { + "epoch": 4.229962232480067, + "grad_norm": 0.928032398223877, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 5040 + }, + { + "epoch": 4.238355014687369, + "grad_norm": 0.7168642282485962, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5050 + }, + { + "epoch": 4.24674779689467, + "grad_norm": 0.7981422543525696, + "learning_rate": 0.0002, + "loss": 1.3767, + "step": 5060 + }, + { + "epoch": 4.2551405791019725, + "grad_norm": 0.6951150894165039, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5070 + }, + { + "epoch": 4.263533361309274, + "grad_norm": 0.7337371706962585, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 5080 + }, + { + "epoch": 4.271926143516576, + "grad_norm": 0.8367464542388916, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5090 + }, + { + "epoch": 4.280318925723877, + "grad_norm": 0.6744083166122437, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 5100 + }, + { + "epoch": 4.28871170793118, + "grad_norm": 0.9072301387786865, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 5110 + }, + { + "epoch": 4.297104490138481, + "grad_norm": 0.7703930735588074, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 5120 + }, + { + "epoch": 4.305497272345782, + "grad_norm": 0.6734083294868469, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5130 + }, + { + "epoch": 4.3138900545530845, + "grad_norm": 0.7835540175437927, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 5140 + }, + { + "epoch": 4.322282836760386, + "grad_norm": 1.0822200775146484, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 5150 + }, + { + "epoch": 4.330675618967688, + "grad_norm": 0.8432536721229553, + "learning_rate": 0.0002, + "loss": 1.4167, + "step": 5160 + }, + { + "epoch": 4.339068401174989, + "grad_norm": 0.6739283800125122, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 5170 + }, + { + "epoch": 4.347461183382292, + "grad_norm": 0.7395278811454773, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 5180 + }, + { + "epoch": 4.355853965589593, + "grad_norm": 0.7638891339302063, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 5190 + }, + { + "epoch": 4.364246747796894, + "grad_norm": 1.1222662925720215, + "learning_rate": 0.0002, + "loss": 1.34, + "step": 5200 + }, + { + "epoch": 4.3726395300041965, + "grad_norm": 0.9102525115013123, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5210 + }, + { + "epoch": 4.381032312211498, + "grad_norm": 0.7181593775749207, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 5220 + }, + { + "epoch": 4.3894250944188, + "grad_norm": 0.7813979387283325, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 5230 + }, + { + "epoch": 4.397817876626101, + "grad_norm": 0.8906185626983643, + "learning_rate": 0.0002, + "loss": 1.423, + "step": 5240 + }, + { + "epoch": 4.406210658833404, + "grad_norm": 0.7456443309783936, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5250 + }, + { + "epoch": 4.414603441040705, + "grad_norm": 0.8752070069313049, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 5260 + }, + { + "epoch": 4.422996223248007, + "grad_norm": 0.9560954570770264, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 5270 + }, + { + "epoch": 4.4313890054553084, + "grad_norm": 0.7227762341499329, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 5280 + }, + { + "epoch": 4.43978178766261, + "grad_norm": 0.8141599893569946, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 5290 + }, + { + "epoch": 4.448174569869912, + "grad_norm": 0.928382158279419, + "learning_rate": 0.0002, + "loss": 1.381, + "step": 5300 + }, + { + "epoch": 4.456567352077213, + "grad_norm": 0.7719997763633728, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5310 + }, + { + "epoch": 4.4649601342845155, + "grad_norm": 0.8081879615783691, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 5320 + }, + { + "epoch": 4.473352916491817, + "grad_norm": 0.7903412580490112, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 5330 + }, + { + "epoch": 4.481745698699119, + "grad_norm": 0.7751287221908569, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5340 + }, + { + "epoch": 4.49013848090642, + "grad_norm": 0.8287544250488281, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5350 + }, + { + "epoch": 4.498531263113723, + "grad_norm": 0.7431012392044067, + "learning_rate": 0.0002, + "loss": 1.3841, + "step": 5360 + }, + { + "epoch": 4.506924045321024, + "grad_norm": 0.8648661971092224, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5370 + }, + { + "epoch": 4.515316827528325, + "grad_norm": 0.9314997792243958, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5380 + }, + { + "epoch": 4.5237096097356275, + "grad_norm": 0.7530864477157593, + "learning_rate": 0.0002, + "loss": 1.354, + "step": 5390 + }, + { + "epoch": 4.532102391942929, + "grad_norm": 0.8739821910858154, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 5400 + }, + { + "epoch": 4.540495174150231, + "grad_norm": 0.8090344667434692, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 4.548887956357532, + "grad_norm": 0.7530879974365234, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 5420 + }, + { + "epoch": 4.557280738564835, + "grad_norm": 0.8787251114845276, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 5430 + }, + { + "epoch": 4.565673520772136, + "grad_norm": 0.813961923122406, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 5440 + }, + { + "epoch": 4.574066302979437, + "grad_norm": 0.7778232097625732, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 5450 + }, + { + "epoch": 4.5824590851867395, + "grad_norm": 0.7323020696640015, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5460 + }, + { + "epoch": 4.590851867394041, + "grad_norm": 0.7826765179634094, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5470 + }, + { + "epoch": 4.599244649601343, + "grad_norm": 0.7245969772338867, + "learning_rate": 0.0002, + "loss": 1.4068, + "step": 5480 + }, + { + "epoch": 4.607637431808644, + "grad_norm": 0.7697308659553528, + "learning_rate": 0.0002, + "loss": 1.4276, + "step": 5490 + }, + { + "epoch": 4.616030214015947, + "grad_norm": 0.8053571581840515, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5500 + }, + { + "epoch": 4.624422996223248, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 5510 + }, + { + "epoch": 4.632815778430549, + "grad_norm": 0.7398585677146912, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 5520 + }, + { + "epoch": 4.6412085606378515, + "grad_norm": 0.7896319031715393, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 5530 + }, + { + "epoch": 4.649601342845153, + "grad_norm": 0.8290980458259583, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5540 + }, + { + "epoch": 4.657994125052455, + "grad_norm": 0.8232647776603699, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 5550 + }, + { + "epoch": 4.666386907259756, + "grad_norm": 0.9154987335205078, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5560 + }, + { + "epoch": 4.674779689467059, + "grad_norm": 0.8400886654853821, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 5570 + }, + { + "epoch": 4.68317247167436, + "grad_norm": 0.7312718629837036, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 5580 + }, + { + "epoch": 4.691565253881662, + "grad_norm": 0.8043803572654724, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5590 + }, + { + "epoch": 4.6999580360889635, + "grad_norm": 0.7966225147247314, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5600 + }, + { + "epoch": 4.708350818296266, + "grad_norm": 0.881574809551239, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 5610 + }, + { + "epoch": 4.716743600503567, + "grad_norm": 0.7252084016799927, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5620 + }, + { + "epoch": 4.725136382710868, + "grad_norm": 0.7726518511772156, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 5630 + }, + { + "epoch": 4.7335291649181706, + "grad_norm": 0.7306379079818726, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5640 + }, + { + "epoch": 4.741921947125472, + "grad_norm": 0.8029969334602356, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 5650 + }, + { + "epoch": 4.750314729332774, + "grad_norm": 0.9103893637657166, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 5660 + }, + { + "epoch": 4.758707511540075, + "grad_norm": 0.8783416748046875, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5670 + }, + { + "epoch": 4.767100293747378, + "grad_norm": 0.6807119846343994, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 5680 + }, + { + "epoch": 4.775493075954679, + "grad_norm": 0.7103772759437561, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 5690 + }, + { + "epoch": 4.78388585816198, + "grad_norm": 0.8472093343734741, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 5700 + }, + { + "epoch": 4.7922786403692825, + "grad_norm": 0.851847231388092, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 5710 + }, + { + "epoch": 4.800671422576584, + "grad_norm": 0.9084636569023132, + "learning_rate": 0.0002, + "loss": 1.3965, + "step": 5720 + }, + { + "epoch": 4.809064204783886, + "grad_norm": 0.7628585696220398, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 5730 + }, + { + "epoch": 4.817456986991187, + "grad_norm": 0.775580883026123, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 5740 + }, + { + "epoch": 4.82584976919849, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 1.4573, + "step": 5750 + }, + { + "epoch": 4.834242551405791, + "grad_norm": 0.7021728754043579, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 5760 + }, + { + "epoch": 4.842635333613092, + "grad_norm": 0.7810541391372681, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5770 + }, + { + "epoch": 4.8510281158203945, + "grad_norm": 0.7290041446685791, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5780 + }, + { + "epoch": 4.859420898027696, + "grad_norm": 0.9059709906578064, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5790 + }, + { + "epoch": 4.867813680234998, + "grad_norm": 0.8338062167167664, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 5800 + }, + { + "epoch": 4.876206462442299, + "grad_norm": 0.830926775932312, + "learning_rate": 0.0002, + "loss": 1.395, + "step": 5810 + }, + { + "epoch": 4.884599244649602, + "grad_norm": 0.7818633317947388, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5820 + }, + { + "epoch": 4.892992026856903, + "grad_norm": 0.8143376708030701, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5830 + }, + { + "epoch": 4.901384809064205, + "grad_norm": 0.7754496335983276, + "learning_rate": 0.0002, + "loss": 1.3583, + "step": 5840 + }, + { + "epoch": 4.9097775912715065, + "grad_norm": 0.7154468297958374, + "learning_rate": 0.0002, + "loss": 1.4036, + "step": 5850 + }, + { + "epoch": 4.918170373478809, + "grad_norm": 0.6829783916473389, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 5860 + }, + { + "epoch": 4.92656315568611, + "grad_norm": 0.784919261932373, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5870 + }, + { + "epoch": 4.934955937893411, + "grad_norm": 0.8168354034423828, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5880 + }, + { + "epoch": 4.943348720100714, + "grad_norm": 0.7356618642807007, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 5890 + }, + { + "epoch": 4.951741502308015, + "grad_norm": 0.7399224042892456, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 5900 + }, + { + "epoch": 4.960134284515317, + "grad_norm": 0.7430436015129089, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 5910 + }, + { + "epoch": 4.9685270667226185, + "grad_norm": 0.7587705850601196, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 5920 + }, + { + "epoch": 4.976919848929921, + "grad_norm": 0.9103638529777527, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5930 + }, + { + "epoch": 4.985312631137222, + "grad_norm": 0.7357394695281982, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5940 + }, + { + "epoch": 4.993705413344523, + "grad_norm": 0.7371547222137451, + "learning_rate": 0.0002, + "loss": 1.3988, + "step": 5950 + }, + { + "epoch": 4.9995803608896345, + "eval_loss": 1.9367210865020752, + "eval_runtime": 37.9833, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 5957 + }, + { + "epoch": 5.0020981955518256, + "grad_norm": 0.7783351540565491, + "learning_rate": 0.0002, + "loss": 1.3876, + "step": 5960 + }, + { + "epoch": 5.010490977759127, + "grad_norm": 0.9268898367881775, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 5970 + }, + { + "epoch": 5.018883759966429, + "grad_norm": 0.9562761783599854, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 5980 + }, + { + "epoch": 5.02727654217373, + "grad_norm": 0.9391738176345825, + "learning_rate": 0.0002, + "loss": 1.205, + "step": 5990 + }, + { + "epoch": 5.035669324381033, + "grad_norm": 0.850326418876648, + "learning_rate": 0.0002, + "loss": 1.2112, + "step": 6000 + }, + { + "epoch": 5.044062106588334, + "grad_norm": 0.8442679643630981, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 6010 + }, + { + "epoch": 5.052454888795635, + "grad_norm": 1.2147290706634521, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6020 + }, + { + "epoch": 5.0608476710029375, + "grad_norm": 0.9732922315597534, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6030 + }, + { + "epoch": 5.069240453210239, + "grad_norm": 0.9354516267776489, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6040 + }, + { + "epoch": 5.077633235417541, + "grad_norm": 0.9681560397148132, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 6050 + }, + { + "epoch": 5.086026017624842, + "grad_norm": 0.9500439763069153, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 6060 + }, + { + "epoch": 5.094418799832145, + "grad_norm": 0.8693879246711731, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6070 + }, + { + "epoch": 5.102811582039446, + "grad_norm": 1.1066458225250244, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 6080 + }, + { + "epoch": 5.111204364246748, + "grad_norm": 0.9530285000801086, + "learning_rate": 0.0002, + "loss": 1.2135, + "step": 6090 + }, + { + "epoch": 5.1195971464540495, + "grad_norm": 0.9323630928993225, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 6100 + }, + { + "epoch": 5.127989928661351, + "grad_norm": 0.9040294885635376, + "learning_rate": 0.0002, + "loss": 1.2434, + "step": 6110 + }, + { + "epoch": 5.136382710868653, + "grad_norm": 0.9981122612953186, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 6120 + }, + { + "epoch": 5.144775493075954, + "grad_norm": 0.9070921540260315, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6130 + }, + { + "epoch": 5.153168275283257, + "grad_norm": 1.043802261352539, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 6140 + }, + { + "epoch": 5.161561057490558, + "grad_norm": 1.0889761447906494, + "learning_rate": 0.0002, + "loss": 1.1865, + "step": 6150 + }, + { + "epoch": 5.16995383969786, + "grad_norm": 0.9908999800682068, + "learning_rate": 0.0002, + "loss": 1.2498, + "step": 6160 + }, + { + "epoch": 5.1783466219051615, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 6170 + }, + { + "epoch": 5.186739404112464, + "grad_norm": 0.9536478519439697, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 6180 + }, + { + "epoch": 5.195132186319765, + "grad_norm": 0.8672952055931091, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 6190 + }, + { + "epoch": 5.203524968527066, + "grad_norm": 1.0116329193115234, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 6200 + }, + { + "epoch": 5.211917750734369, + "grad_norm": 0.9327153563499451, + "learning_rate": 0.0002, + "loss": 1.1813, + "step": 6210 + }, + { + "epoch": 5.22031053294167, + "grad_norm": 0.85637366771698, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 6220 + }, + { + "epoch": 5.228703315148972, + "grad_norm": 1.0490736961364746, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6230 + }, + { + "epoch": 5.2370960973562735, + "grad_norm": 0.8849565982818604, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 6240 + }, + { + "epoch": 5.245488879563576, + "grad_norm": 0.8852671980857849, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 6250 + }, + { + "epoch": 5.253881661770877, + "grad_norm": 0.9146860241889954, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 6260 + }, + { + "epoch": 5.262274443978178, + "grad_norm": 1.0188325643539429, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 6270 + }, + { + "epoch": 5.270667226185481, + "grad_norm": 1.0053156614303589, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 6280 + }, + { + "epoch": 5.279060008392782, + "grad_norm": 0.9962273836135864, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 6290 + }, + { + "epoch": 5.287452790600084, + "grad_norm": 1.000300645828247, + "learning_rate": 0.0002, + "loss": 1.2487, + "step": 6300 + }, + { + "epoch": 5.295845572807385, + "grad_norm": 0.9821932911872864, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 6310 + }, + { + "epoch": 5.304238355014688, + "grad_norm": 1.0103896856307983, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 6320 + }, + { + "epoch": 5.312631137221989, + "grad_norm": 0.9323601722717285, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 6330 + }, + { + "epoch": 5.321023919429291, + "grad_norm": 1.0668879747390747, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 6340 + }, + { + "epoch": 5.3294167016365925, + "grad_norm": 0.9666323065757751, + "learning_rate": 0.0002, + "loss": 1.2411, + "step": 6350 + }, + { + "epoch": 5.337809483843894, + "grad_norm": 0.9439574480056763, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 6360 + }, + { + "epoch": 5.346202266051196, + "grad_norm": 1.0229361057281494, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 6370 + }, + { + "epoch": 5.354595048258497, + "grad_norm": 0.8522404432296753, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 6380 + }, + { + "epoch": 5.3629878304658, + "grad_norm": 1.3732287883758545, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 6390 + }, + { + "epoch": 5.371380612673101, + "grad_norm": 0.8201091885566711, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 6400 + }, + { + "epoch": 5.379773394880403, + "grad_norm": 0.8874436616897583, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6410 + }, + { + "epoch": 5.3881661770877045, + "grad_norm": 1.0118640661239624, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6420 + }, + { + "epoch": 5.396558959295007, + "grad_norm": 1.0468370914459229, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6430 + }, + { + "epoch": 5.404951741502308, + "grad_norm": 0.941806972026825, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 6440 + }, + { + "epoch": 5.413344523709609, + "grad_norm": 0.9860424399375916, + "learning_rate": 0.0002, + "loss": 1.3289, + "step": 6450 + }, + { + "epoch": 5.421737305916912, + "grad_norm": 1.009628176689148, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 6460 + }, + { + "epoch": 5.430130088124213, + "grad_norm": 0.9842159748077393, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6470 + }, + { + "epoch": 5.438522870331515, + "grad_norm": 0.9935571551322937, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 6480 + }, + { + "epoch": 5.4469156525388165, + "grad_norm": 0.8872362971305847, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6490 + }, + { + "epoch": 5.455308434746119, + "grad_norm": 0.9530836939811707, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 6500 + }, + { + "epoch": 5.46370121695342, + "grad_norm": 0.8111279010772705, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 6510 + }, + { + "epoch": 5.472093999160721, + "grad_norm": 1.0474516153335571, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 6520 + }, + { + "epoch": 5.480486781368024, + "grad_norm": 1.0228482484817505, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 6530 + }, + { + "epoch": 5.488879563575325, + "grad_norm": 1.0299347639083862, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 6540 + }, + { + "epoch": 5.497272345782627, + "grad_norm": 0.9105098247528076, + "learning_rate": 0.0002, + "loss": 1.3163, + "step": 6550 + }, + { + "epoch": 5.5056651279899285, + "grad_norm": 1.2459523677825928, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 6560 + }, + { + "epoch": 5.514057910197231, + "grad_norm": 1.0630481243133545, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6570 + }, + { + "epoch": 5.522450692404532, + "grad_norm": 0.8310980796813965, + "learning_rate": 0.0002, + "loss": 1.3003, + "step": 6580 + }, + { + "epoch": 5.530843474611833, + "grad_norm": 1.102723479270935, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 6590 + }, + { + "epoch": 5.539236256819136, + "grad_norm": 0.9586807489395142, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 6600 + }, + { + "epoch": 5.547629039026437, + "grad_norm": 0.976191520690918, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 6610 + }, + { + "epoch": 5.556021821233739, + "grad_norm": 0.9943762421607971, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 6620 + }, + { + "epoch": 5.56441460344104, + "grad_norm": 0.8788089156150818, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 6630 + }, + { + "epoch": 5.572807385648343, + "grad_norm": 0.9866173267364502, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6640 + }, + { + "epoch": 5.581200167855644, + "grad_norm": 1.0791642665863037, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6650 + }, + { + "epoch": 5.589592950062946, + "grad_norm": 0.836482584476471, + "learning_rate": 0.0002, + "loss": 1.2806, + "step": 6660 + }, + { + "epoch": 5.5979857322702475, + "grad_norm": 0.9841130971908569, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 6670 + }, + { + "epoch": 5.60637851447755, + "grad_norm": 0.9678813815116882, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 6680 + }, + { + "epoch": 5.614771296684851, + "grad_norm": 0.9033233523368835, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 6690 + }, + { + "epoch": 5.623164078892152, + "grad_norm": 0.8691515922546387, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6700 + }, + { + "epoch": 5.631556861099455, + "grad_norm": 0.8971360921859741, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6710 + }, + { + "epoch": 5.639949643306756, + "grad_norm": 0.9377756118774414, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 6720 + }, + { + "epoch": 5.648342425514058, + "grad_norm": 0.908762514591217, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 6730 + }, + { + "epoch": 5.6567352077213595, + "grad_norm": 1.0503337383270264, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 6740 + }, + { + "epoch": 5.665127989928662, + "grad_norm": 1.030267357826233, + "learning_rate": 0.0002, + "loss": 1.3604, + "step": 6750 + }, + { + "epoch": 5.673520772135963, + "grad_norm": 0.9150485992431641, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 6760 + }, + { + "epoch": 5.681913554343264, + "grad_norm": 1.0300343036651611, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6770 + }, + { + "epoch": 5.690306336550567, + "grad_norm": 1.1242924928665161, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 6780 + }, + { + "epoch": 5.698699118757868, + "grad_norm": 0.9489498138427734, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6790 + }, + { + "epoch": 5.70709190096517, + "grad_norm": 0.8829707503318787, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 6800 + }, + { + "epoch": 5.7154846831724715, + "grad_norm": 1.01392662525177, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 6810 + }, + { + "epoch": 5.723877465379774, + "grad_norm": 0.9234510064125061, + "learning_rate": 0.0002, + "loss": 1.3029, + "step": 6820 + }, + { + "epoch": 5.732270247587075, + "grad_norm": 0.9439187049865723, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 6830 + }, + { + "epoch": 5.740663029794376, + "grad_norm": 0.8833441734313965, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6840 + }, + { + "epoch": 5.749055812001679, + "grad_norm": 0.9394439458847046, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 6850 + }, + { + "epoch": 5.75744859420898, + "grad_norm": 0.9980010390281677, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 6860 + }, + { + "epoch": 5.765841376416282, + "grad_norm": 0.9612377882003784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 6870 + }, + { + "epoch": 5.7742341586235835, + "grad_norm": 1.0817323923110962, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6880 + }, + { + "epoch": 5.782626940830886, + "grad_norm": 0.8445103168487549, + "learning_rate": 0.0002, + "loss": 1.2485, + "step": 6890 + }, + { + "epoch": 5.791019723038187, + "grad_norm": 0.8535459041595459, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6900 + }, + { + "epoch": 5.799412505245489, + "grad_norm": 0.9131284356117249, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 6910 + }, + { + "epoch": 5.807805287452791, + "grad_norm": 0.8627726435661316, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6920 + }, + { + "epoch": 5.816198069660093, + "grad_norm": 0.8599951863288879, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 6930 + }, + { + "epoch": 5.824590851867394, + "grad_norm": 1.0746861696243286, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 6940 + }, + { + "epoch": 5.8329836340746954, + "grad_norm": 1.0220543146133423, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 6950 + }, + { + "epoch": 5.841376416281998, + "grad_norm": 0.8891388177871704, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 6960 + }, + { + "epoch": 5.849769198489299, + "grad_norm": 1.1404683589935303, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6970 + }, + { + "epoch": 5.858161980696601, + "grad_norm": 0.9665380120277405, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 6980 + }, + { + "epoch": 5.8665547629039025, + "grad_norm": 0.9837968945503235, + "learning_rate": 0.0002, + "loss": 1.2622, + "step": 6990 + }, + { + "epoch": 5.874947545111205, + "grad_norm": 1.0278598070144653, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 7000 + }, + { + "epoch": 5.883340327318506, + "grad_norm": 0.9990253448486328, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7010 + }, + { + "epoch": 5.891733109525807, + "grad_norm": 0.9705647230148315, + "learning_rate": 0.0002, + "loss": 1.3508, + "step": 7020 + }, + { + "epoch": 5.90012589173311, + "grad_norm": 0.9672252535820007, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 7030 + }, + { + "epoch": 5.908518673940411, + "grad_norm": 0.9467034339904785, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 7040 + }, + { + "epoch": 5.916911456147713, + "grad_norm": 0.9506469964981079, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 7050 + }, + { + "epoch": 5.9253042383550145, + "grad_norm": 0.8936163783073425, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 7060 + }, + { + "epoch": 5.933697020562317, + "grad_norm": 0.956101655960083, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 7070 + }, + { + "epoch": 5.942089802769618, + "grad_norm": 0.893535852432251, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 7080 + }, + { + "epoch": 5.950482584976919, + "grad_norm": 1.0313799381256104, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 7090 + }, + { + "epoch": 5.958875367184222, + "grad_norm": 0.8567915558815002, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 7100 + }, + { + "epoch": 5.967268149391523, + "grad_norm": 0.9683501720428467, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 7110 + }, + { + "epoch": 5.975660931598825, + "grad_norm": 0.9401984214782715, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 7120 + }, + { + "epoch": 5.9840537138061265, + "grad_norm": 1.0316764116287231, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 7130 + }, + { + "epoch": 5.992446496013429, + "grad_norm": 0.9335392713546753, + "learning_rate": 0.0002, + "loss": 1.2445, + "step": 7140 + }, + { + "epoch": 6.0, + "eval_loss": 2.041194438934326, + "eval_runtime": 37.9642, + "eval_samples_per_second": 13.565, + "eval_steps_per_second": 1.712, + "step": 7149 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.308397127406715e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-7149/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1fab7d5bb84bcde374d4594e5f95d3af718012d7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a43ed1af74e19733a814c62273ff57e003e31753521b992472024157a06b609 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..88c19b25ab5e24f71ee9cbb725227c0f5f9e04dd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74b9176a9d579c3f4cd8bfe3fdf3755baa81b29a2ae39286aed2802e5523b933 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..91900d20735546783fda083465df8a95091b6694 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14f9987a9ae4efb29316b2fe50716f90a85adb22a0da1a8e428957e52b3f4e1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f39a46ce11491eeb083306232deab0ed9c0b9f7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b6f4f5135c68f8031b127cfb5c3114ba5e0c32e28f7a54e5af66558523f2a92 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae49cbad74cf8196954f6ae9a6dfcdf72e929aa8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/trainer_state.json @@ -0,0 +1,5927 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 6.9995803608896345, + "eval_steps": 10, + "global_step": 8340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + }, + { + "epoch": 3.004616030214016, + "grad_norm": 0.4380604326725006, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 3580 + }, + { + "epoch": 3.0130088124213175, + "grad_norm": 0.5375564098358154, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 3590 + }, + { + "epoch": 3.0214015946286192, + "grad_norm": 0.50722736120224, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3600 + }, + { + "epoch": 3.029794376835921, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 3610 + }, + { + "epoch": 3.038187159043223, + "grad_norm": 0.520709753036499, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 3620 + }, + { + "epoch": 3.0465799412505246, + "grad_norm": 0.5429664850234985, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3630 + }, + { + "epoch": 3.0549727234578263, + "grad_norm": 0.5634943842887878, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 3640 + }, + { + "epoch": 3.063365505665128, + "grad_norm": 0.5042277574539185, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 3650 + }, + { + "epoch": 3.07175828787243, + "grad_norm": 0.5778711438179016, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 3660 + }, + { + "epoch": 3.080151070079731, + "grad_norm": 0.5504926443099976, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 3670 + }, + { + "epoch": 3.088543852287033, + "grad_norm": 0.5199463963508606, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3680 + }, + { + "epoch": 3.0969366344943348, + "grad_norm": 0.552334189414978, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 3690 + }, + { + "epoch": 3.1053294167016365, + "grad_norm": 0.5650873780250549, + "learning_rate": 0.0002, + "loss": 1.4638, + "step": 3700 + }, + { + "epoch": 3.1137221989089383, + "grad_norm": 0.6292349696159363, + "learning_rate": 0.0002, + "loss": 1.4945, + "step": 3710 + }, + { + "epoch": 3.12211498111624, + "grad_norm": 0.5523604154586792, + "learning_rate": 0.0002, + "loss": 1.4787, + "step": 3720 + }, + { + "epoch": 3.130507763323542, + "grad_norm": 0.6160100698471069, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3730 + }, + { + "epoch": 3.1389005455308436, + "grad_norm": 0.6091629266738892, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3740 + }, + { + "epoch": 3.1472933277381454, + "grad_norm": 0.5695531964302063, + "learning_rate": 0.0002, + "loss": 1.4659, + "step": 3750 + }, + { + "epoch": 3.1556861099454467, + "grad_norm": 0.569611132144928, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 3760 + }, + { + "epoch": 3.1640788921527485, + "grad_norm": 0.5761140584945679, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 3770 + }, + { + "epoch": 3.1724716743600503, + "grad_norm": 0.6855548620223999, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 3780 + }, + { + "epoch": 3.180864456567352, + "grad_norm": 0.5815101265907288, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 3790 + }, + { + "epoch": 3.189257238774654, + "grad_norm": 0.6179960370063782, + "learning_rate": 0.0002, + "loss": 1.5289, + "step": 3800 + }, + { + "epoch": 3.1976500209819556, + "grad_norm": 0.5418674349784851, + "learning_rate": 0.0002, + "loss": 1.4833, + "step": 3810 + }, + { + "epoch": 3.2060428031892574, + "grad_norm": 0.5655816197395325, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 3820 + }, + { + "epoch": 3.214435585396559, + "grad_norm": 0.7279291152954102, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 3830 + }, + { + "epoch": 3.2228283676038605, + "grad_norm": 0.490998238325119, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3840 + }, + { + "epoch": 3.2312211498111623, + "grad_norm": 0.6065797209739685, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 3850 + }, + { + "epoch": 3.239613932018464, + "grad_norm": 0.6024682521820068, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3860 + }, + { + "epoch": 3.248006714225766, + "grad_norm": 0.5571125745773315, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 3870 + }, + { + "epoch": 3.2563994964330676, + "grad_norm": 0.5662134289741516, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3880 + }, + { + "epoch": 3.2647922786403694, + "grad_norm": 0.5936661958694458, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 3890 + }, + { + "epoch": 3.273185060847671, + "grad_norm": 0.6739671230316162, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3900 + }, + { + "epoch": 3.281577843054973, + "grad_norm": 0.5579532384872437, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3910 + }, + { + "epoch": 3.2899706252622742, + "grad_norm": 0.6595954298973083, + "learning_rate": 0.0002, + "loss": 1.4788, + "step": 3920 + }, + { + "epoch": 3.298363407469576, + "grad_norm": 0.5712262988090515, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3930 + }, + { + "epoch": 3.306756189676878, + "grad_norm": 0.5601761341094971, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3940 + }, + { + "epoch": 3.3151489718841796, + "grad_norm": 0.5759967565536499, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3950 + }, + { + "epoch": 3.3235417540914813, + "grad_norm": 0.6543047428131104, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 3960 + }, + { + "epoch": 3.331934536298783, + "grad_norm": 0.6355253458023071, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 3970 + }, + { + "epoch": 3.340327318506085, + "grad_norm": 0.5671007633209229, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3980 + }, + { + "epoch": 3.3487201007133867, + "grad_norm": 0.6743636727333069, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 3990 + }, + { + "epoch": 3.3571128829206884, + "grad_norm": 0.500627338886261, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 4000 + }, + { + "epoch": 3.3655056651279898, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4010 + }, + { + "epoch": 3.3738984473352915, + "grad_norm": 0.5651408433914185, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 4020 + }, + { + "epoch": 3.3822912295425933, + "grad_norm": 0.6338897943496704, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 4030 + }, + { + "epoch": 3.390684011749895, + "grad_norm": 0.5781935453414917, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 4040 + }, + { + "epoch": 3.399076793957197, + "grad_norm": 0.55543053150177, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4050 + }, + { + "epoch": 3.4074695761644986, + "grad_norm": 0.6602614521980286, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4060 + }, + { + "epoch": 3.4158623583718004, + "grad_norm": 0.5514156222343445, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 4070 + }, + { + "epoch": 3.4242551405791017, + "grad_norm": 0.5760560035705566, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 4080 + }, + { + "epoch": 3.4326479227864035, + "grad_norm": 0.657503604888916, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4090 + }, + { + "epoch": 3.4410407049937053, + "grad_norm": 0.5746736526489258, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4100 + }, + { + "epoch": 3.449433487201007, + "grad_norm": 0.5988999009132385, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 4110 + }, + { + "epoch": 3.457826269408309, + "grad_norm": 0.7294586300849915, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4120 + }, + { + "epoch": 3.4662190516156106, + "grad_norm": 0.6391161680221558, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 4130 + }, + { + "epoch": 3.4746118338229124, + "grad_norm": 0.6416470408439636, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4140 + }, + { + "epoch": 3.483004616030214, + "grad_norm": 0.5710626244544983, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 4150 + }, + { + "epoch": 3.491397398237516, + "grad_norm": 0.5370054841041565, + "learning_rate": 0.0002, + "loss": 1.4661, + "step": 4160 + }, + { + "epoch": 3.4997901804448173, + "grad_norm": 0.5559558272361755, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 4170 + }, + { + "epoch": 3.508182962652119, + "grad_norm": 0.5426168441772461, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 4180 + }, + { + "epoch": 3.516575744859421, + "grad_norm": 0.5997438430786133, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 4190 + }, + { + "epoch": 3.5249685270667226, + "grad_norm": 0.5399143099784851, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4200 + }, + { + "epoch": 3.5333613092740244, + "grad_norm": 0.6341416239738464, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 4210 + }, + { + "epoch": 3.541754091481326, + "grad_norm": 0.632238507270813, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4220 + }, + { + "epoch": 3.550146873688628, + "grad_norm": 0.6356478333473206, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 4230 + }, + { + "epoch": 3.5585396558959292, + "grad_norm": 0.6379408240318298, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 4240 + }, + { + "epoch": 3.5669324381032315, + "grad_norm": 0.6265586018562317, + "learning_rate": 0.0002, + "loss": 1.5184, + "step": 4250 + }, + { + "epoch": 3.575325220310533, + "grad_norm": 0.5378820896148682, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 4260 + }, + { + "epoch": 3.5837180025178346, + "grad_norm": 0.6800801753997803, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4270 + }, + { + "epoch": 3.5921107847251363, + "grad_norm": 0.5653113126754761, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4280 + }, + { + "epoch": 3.600503566932438, + "grad_norm": 0.548647940158844, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 4290 + }, + { + "epoch": 3.60889634913974, + "grad_norm": 0.5729944705963135, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4300 + }, + { + "epoch": 3.6172891313470417, + "grad_norm": 0.6204999685287476, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4310 + }, + { + "epoch": 3.6256819135543434, + "grad_norm": 0.6275812983512878, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 4320 + }, + { + "epoch": 3.6340746957616448, + "grad_norm": 0.7261835336685181, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 4330 + }, + { + "epoch": 3.6424674779689465, + "grad_norm": 0.6048004627227783, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 4340 + }, + { + "epoch": 3.6508602601762483, + "grad_norm": 0.5879671573638916, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 4350 + }, + { + "epoch": 3.65925304238355, + "grad_norm": 0.6001018285751343, + "learning_rate": 0.0002, + "loss": 1.5477, + "step": 4360 + }, + { + "epoch": 3.667645824590852, + "grad_norm": 0.6468151211738586, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4370 + }, + { + "epoch": 3.6760386067981536, + "grad_norm": 0.6342051029205322, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4380 + }, + { + "epoch": 3.6844313890054554, + "grad_norm": 0.6078384518623352, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4390 + }, + { + "epoch": 3.692824171212757, + "grad_norm": 0.5555588006973267, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4400 + }, + { + "epoch": 3.701216953420059, + "grad_norm": 0.6089665293693542, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 4410 + }, + { + "epoch": 3.7096097356273603, + "grad_norm": 0.6225191950798035, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 4420 + }, + { + "epoch": 3.718002517834662, + "grad_norm": 0.5642715692520142, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 4430 + }, + { + "epoch": 3.726395300041964, + "grad_norm": 0.5703449845314026, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 4440 + }, + { + "epoch": 3.7347880822492656, + "grad_norm": 0.6029745936393738, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 4450 + }, + { + "epoch": 3.7431808644565674, + "grad_norm": 0.7089189887046814, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4460 + }, + { + "epoch": 3.751573646663869, + "grad_norm": 0.6230936050415039, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 4470 + }, + { + "epoch": 3.759966428871171, + "grad_norm": 0.5718494653701782, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4480 + }, + { + "epoch": 3.7683592110784723, + "grad_norm": 0.5404117703437805, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4490 + }, + { + "epoch": 3.7767519932857745, + "grad_norm": 0.5816529393196106, + "learning_rate": 0.0002, + "loss": 1.4707, + "step": 4500 + }, + { + "epoch": 3.785144775493076, + "grad_norm": 0.6314901113510132, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 4510 + }, + { + "epoch": 3.7935375577003776, + "grad_norm": 0.7639698386192322, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 4520 + }, + { + "epoch": 3.8019303399076794, + "grad_norm": 0.5727366209030151, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4530 + }, + { + "epoch": 3.810323122114981, + "grad_norm": 0.6467128396034241, + "learning_rate": 0.0002, + "loss": 1.5409, + "step": 4540 + }, + { + "epoch": 3.818715904322283, + "grad_norm": 0.6572837233543396, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 4550 + }, + { + "epoch": 3.8271086865295847, + "grad_norm": 0.5847418904304504, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4560 + }, + { + "epoch": 3.8355014687368865, + "grad_norm": 0.48820871114730835, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 4570 + }, + { + "epoch": 3.843894250944188, + "grad_norm": 1.2537429332733154, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 4580 + }, + { + "epoch": 3.8522870331514896, + "grad_norm": 0.6026989221572876, + "learning_rate": 0.0002, + "loss": 1.5522, + "step": 4590 + }, + { + "epoch": 3.8606798153587913, + "grad_norm": 0.5541417598724365, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 4600 + }, + { + "epoch": 3.869072597566093, + "grad_norm": 0.7668771147727966, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 4610 + }, + { + "epoch": 3.877465379773395, + "grad_norm": 0.6181227564811707, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4620 + }, + { + "epoch": 3.8858581619806967, + "grad_norm": 0.5842700004577637, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4630 + }, + { + "epoch": 3.8942509441879984, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 4640 + }, + { + "epoch": 3.9026437263952998, + "grad_norm": 0.6212735772132874, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4650 + }, + { + "epoch": 3.911036508602602, + "grad_norm": 0.6123346090316772, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 4660 + }, + { + "epoch": 3.9194292908099033, + "grad_norm": 0.518662691116333, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4670 + }, + { + "epoch": 3.927822073017205, + "grad_norm": 0.6963476538658142, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4680 + }, + { + "epoch": 3.936214855224507, + "grad_norm": 0.5192152261734009, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4690 + }, + { + "epoch": 3.9446076374318086, + "grad_norm": 0.5820888876914978, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 4700 + }, + { + "epoch": 3.9530004196391104, + "grad_norm": 0.6320387721061707, + "learning_rate": 0.0002, + "loss": 1.527, + "step": 4710 + }, + { + "epoch": 3.961393201846412, + "grad_norm": 0.6174548268318176, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4720 + }, + { + "epoch": 3.969785984053714, + "grad_norm": 0.6691966652870178, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4730 + }, + { + "epoch": 3.9781787662610153, + "grad_norm": 0.5972068309783936, + "learning_rate": 0.0002, + "loss": 1.4762, + "step": 4740 + }, + { + "epoch": 3.9865715484683175, + "grad_norm": 0.5759536027908325, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4750 + }, + { + "epoch": 3.994964330675619, + "grad_norm": 0.5886756777763367, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 4760 + }, + { + "epoch": 4.0, + "eval_loss": 1.8749940395355225, + "eval_runtime": 38.037, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 4766 + }, + { + "epoch": 4.003357112882921, + "grad_norm": 0.5915011167526245, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 4770 + }, + { + "epoch": 4.011749895090222, + "grad_norm": 0.8565000891685486, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 4780 + }, + { + "epoch": 4.020142677297524, + "grad_norm": 0.7753950953483582, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 4790 + }, + { + "epoch": 4.028535459504826, + "grad_norm": 0.6837254166603088, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 4800 + }, + { + "epoch": 4.036928241712127, + "grad_norm": 0.8374526500701904, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 4810 + }, + { + "epoch": 4.0453210239194295, + "grad_norm": 0.8717963099479675, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 4820 + }, + { + "epoch": 4.053713806126731, + "grad_norm": 0.7002043724060059, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4830 + }, + { + "epoch": 4.062106588334033, + "grad_norm": 1.0319572687149048, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 4840 + }, + { + "epoch": 4.070499370541334, + "grad_norm": 0.6746882200241089, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 4850 + }, + { + "epoch": 4.078892152748637, + "grad_norm": 0.8187578320503235, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 4860 + }, + { + "epoch": 4.087284934955938, + "grad_norm": 0.7888399362564087, + "learning_rate": 0.0002, + "loss": 1.368, + "step": 4870 + }, + { + "epoch": 4.095677717163239, + "grad_norm": 0.7149351239204407, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4880 + }, + { + "epoch": 4.1040704993705415, + "grad_norm": 0.9067983031272888, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 4890 + }, + { + "epoch": 4.112463281577843, + "grad_norm": 0.771186351776123, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 4900 + }, + { + "epoch": 4.120856063785145, + "grad_norm": 0.7756485342979431, + "learning_rate": 0.0002, + "loss": 1.2722, + "step": 4910 + }, + { + "epoch": 4.129248845992446, + "grad_norm": 0.7149116396903992, + "learning_rate": 0.0002, + "loss": 1.4138, + "step": 4920 + }, + { + "epoch": 4.137641628199749, + "grad_norm": 0.700442910194397, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4930 + }, + { + "epoch": 4.14603441040705, + "grad_norm": 0.8439189195632935, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4940 + }, + { + "epoch": 4.154427192614351, + "grad_norm": 0.6570779085159302, + "learning_rate": 0.0002, + "loss": 1.3511, + "step": 4950 + }, + { + "epoch": 4.1628199748216534, + "grad_norm": 0.886482298374176, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 4960 + }, + { + "epoch": 4.171212757028955, + "grad_norm": 0.7220938801765442, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 4970 + }, + { + "epoch": 4.179605539236257, + "grad_norm": 0.7185905575752258, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 4980 + }, + { + "epoch": 4.187998321443558, + "grad_norm": 0.7566333413124084, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 4990 + }, + { + "epoch": 4.1963911036508605, + "grad_norm": 0.6960445642471313, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 5000 + }, + { + "epoch": 4.204783885858162, + "grad_norm": 0.7727336883544922, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5010 + }, + { + "epoch": 4.213176668065464, + "grad_norm": 0.8038365244865417, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 5020 + }, + { + "epoch": 4.221569450272765, + "grad_norm": 0.7587628364562988, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5030 + }, + { + "epoch": 4.229962232480067, + "grad_norm": 0.928032398223877, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 5040 + }, + { + "epoch": 4.238355014687369, + "grad_norm": 0.7168642282485962, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5050 + }, + { + "epoch": 4.24674779689467, + "grad_norm": 0.7981422543525696, + "learning_rate": 0.0002, + "loss": 1.3767, + "step": 5060 + }, + { + "epoch": 4.2551405791019725, + "grad_norm": 0.6951150894165039, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5070 + }, + { + "epoch": 4.263533361309274, + "grad_norm": 0.7337371706962585, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 5080 + }, + { + "epoch": 4.271926143516576, + "grad_norm": 0.8367464542388916, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5090 + }, + { + "epoch": 4.280318925723877, + "grad_norm": 0.6744083166122437, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 5100 + }, + { + "epoch": 4.28871170793118, + "grad_norm": 0.9072301387786865, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 5110 + }, + { + "epoch": 4.297104490138481, + "grad_norm": 0.7703930735588074, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 5120 + }, + { + "epoch": 4.305497272345782, + "grad_norm": 0.6734083294868469, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5130 + }, + { + "epoch": 4.3138900545530845, + "grad_norm": 0.7835540175437927, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 5140 + }, + { + "epoch": 4.322282836760386, + "grad_norm": 1.0822200775146484, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 5150 + }, + { + "epoch": 4.330675618967688, + "grad_norm": 0.8432536721229553, + "learning_rate": 0.0002, + "loss": 1.4167, + "step": 5160 + }, + { + "epoch": 4.339068401174989, + "grad_norm": 0.6739283800125122, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 5170 + }, + { + "epoch": 4.347461183382292, + "grad_norm": 0.7395278811454773, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 5180 + }, + { + "epoch": 4.355853965589593, + "grad_norm": 0.7638891339302063, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 5190 + }, + { + "epoch": 4.364246747796894, + "grad_norm": 1.1222662925720215, + "learning_rate": 0.0002, + "loss": 1.34, + "step": 5200 + }, + { + "epoch": 4.3726395300041965, + "grad_norm": 0.9102525115013123, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5210 + }, + { + "epoch": 4.381032312211498, + "grad_norm": 0.7181593775749207, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 5220 + }, + { + "epoch": 4.3894250944188, + "grad_norm": 0.7813979387283325, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 5230 + }, + { + "epoch": 4.397817876626101, + "grad_norm": 0.8906185626983643, + "learning_rate": 0.0002, + "loss": 1.423, + "step": 5240 + }, + { + "epoch": 4.406210658833404, + "grad_norm": 0.7456443309783936, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5250 + }, + { + "epoch": 4.414603441040705, + "grad_norm": 0.8752070069313049, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 5260 + }, + { + "epoch": 4.422996223248007, + "grad_norm": 0.9560954570770264, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 5270 + }, + { + "epoch": 4.4313890054553084, + "grad_norm": 0.7227762341499329, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 5280 + }, + { + "epoch": 4.43978178766261, + "grad_norm": 0.8141599893569946, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 5290 + }, + { + "epoch": 4.448174569869912, + "grad_norm": 0.928382158279419, + "learning_rate": 0.0002, + "loss": 1.381, + "step": 5300 + }, + { + "epoch": 4.456567352077213, + "grad_norm": 0.7719997763633728, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5310 + }, + { + "epoch": 4.4649601342845155, + "grad_norm": 0.8081879615783691, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 5320 + }, + { + "epoch": 4.473352916491817, + "grad_norm": 0.7903412580490112, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 5330 + }, + { + "epoch": 4.481745698699119, + "grad_norm": 0.7751287221908569, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5340 + }, + { + "epoch": 4.49013848090642, + "grad_norm": 0.8287544250488281, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5350 + }, + { + "epoch": 4.498531263113723, + "grad_norm": 0.7431012392044067, + "learning_rate": 0.0002, + "loss": 1.3841, + "step": 5360 + }, + { + "epoch": 4.506924045321024, + "grad_norm": 0.8648661971092224, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5370 + }, + { + "epoch": 4.515316827528325, + "grad_norm": 0.9314997792243958, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5380 + }, + { + "epoch": 4.5237096097356275, + "grad_norm": 0.7530864477157593, + "learning_rate": 0.0002, + "loss": 1.354, + "step": 5390 + }, + { + "epoch": 4.532102391942929, + "grad_norm": 0.8739821910858154, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 5400 + }, + { + "epoch": 4.540495174150231, + "grad_norm": 0.8090344667434692, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 4.548887956357532, + "grad_norm": 0.7530879974365234, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 5420 + }, + { + "epoch": 4.557280738564835, + "grad_norm": 0.8787251114845276, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 5430 + }, + { + "epoch": 4.565673520772136, + "grad_norm": 0.813961923122406, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 5440 + }, + { + "epoch": 4.574066302979437, + "grad_norm": 0.7778232097625732, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 5450 + }, + { + "epoch": 4.5824590851867395, + "grad_norm": 0.7323020696640015, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5460 + }, + { + "epoch": 4.590851867394041, + "grad_norm": 0.7826765179634094, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5470 + }, + { + "epoch": 4.599244649601343, + "grad_norm": 0.7245969772338867, + "learning_rate": 0.0002, + "loss": 1.4068, + "step": 5480 + }, + { + "epoch": 4.607637431808644, + "grad_norm": 0.7697308659553528, + "learning_rate": 0.0002, + "loss": 1.4276, + "step": 5490 + }, + { + "epoch": 4.616030214015947, + "grad_norm": 0.8053571581840515, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5500 + }, + { + "epoch": 4.624422996223248, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 5510 + }, + { + "epoch": 4.632815778430549, + "grad_norm": 0.7398585677146912, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 5520 + }, + { + "epoch": 4.6412085606378515, + "grad_norm": 0.7896319031715393, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 5530 + }, + { + "epoch": 4.649601342845153, + "grad_norm": 0.8290980458259583, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5540 + }, + { + "epoch": 4.657994125052455, + "grad_norm": 0.8232647776603699, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 5550 + }, + { + "epoch": 4.666386907259756, + "grad_norm": 0.9154987335205078, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5560 + }, + { + "epoch": 4.674779689467059, + "grad_norm": 0.8400886654853821, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 5570 + }, + { + "epoch": 4.68317247167436, + "grad_norm": 0.7312718629837036, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 5580 + }, + { + "epoch": 4.691565253881662, + "grad_norm": 0.8043803572654724, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5590 + }, + { + "epoch": 4.6999580360889635, + "grad_norm": 0.7966225147247314, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5600 + }, + { + "epoch": 4.708350818296266, + "grad_norm": 0.881574809551239, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 5610 + }, + { + "epoch": 4.716743600503567, + "grad_norm": 0.7252084016799927, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5620 + }, + { + "epoch": 4.725136382710868, + "grad_norm": 0.7726518511772156, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 5630 + }, + { + "epoch": 4.7335291649181706, + "grad_norm": 0.7306379079818726, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5640 + }, + { + "epoch": 4.741921947125472, + "grad_norm": 0.8029969334602356, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 5650 + }, + { + "epoch": 4.750314729332774, + "grad_norm": 0.9103893637657166, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 5660 + }, + { + "epoch": 4.758707511540075, + "grad_norm": 0.8783416748046875, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5670 + }, + { + "epoch": 4.767100293747378, + "grad_norm": 0.6807119846343994, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 5680 + }, + { + "epoch": 4.775493075954679, + "grad_norm": 0.7103772759437561, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 5690 + }, + { + "epoch": 4.78388585816198, + "grad_norm": 0.8472093343734741, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 5700 + }, + { + "epoch": 4.7922786403692825, + "grad_norm": 0.851847231388092, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 5710 + }, + { + "epoch": 4.800671422576584, + "grad_norm": 0.9084636569023132, + "learning_rate": 0.0002, + "loss": 1.3965, + "step": 5720 + }, + { + "epoch": 4.809064204783886, + "grad_norm": 0.7628585696220398, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 5730 + }, + { + "epoch": 4.817456986991187, + "grad_norm": 0.775580883026123, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 5740 + }, + { + "epoch": 4.82584976919849, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 1.4573, + "step": 5750 + }, + { + "epoch": 4.834242551405791, + "grad_norm": 0.7021728754043579, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 5760 + }, + { + "epoch": 4.842635333613092, + "grad_norm": 0.7810541391372681, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5770 + }, + { + "epoch": 4.8510281158203945, + "grad_norm": 0.7290041446685791, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5780 + }, + { + "epoch": 4.859420898027696, + "grad_norm": 0.9059709906578064, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5790 + }, + { + "epoch": 4.867813680234998, + "grad_norm": 0.8338062167167664, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 5800 + }, + { + "epoch": 4.876206462442299, + "grad_norm": 0.830926775932312, + "learning_rate": 0.0002, + "loss": 1.395, + "step": 5810 + }, + { + "epoch": 4.884599244649602, + "grad_norm": 0.7818633317947388, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5820 + }, + { + "epoch": 4.892992026856903, + "grad_norm": 0.8143376708030701, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5830 + }, + { + "epoch": 4.901384809064205, + "grad_norm": 0.7754496335983276, + "learning_rate": 0.0002, + "loss": 1.3583, + "step": 5840 + }, + { + "epoch": 4.9097775912715065, + "grad_norm": 0.7154468297958374, + "learning_rate": 0.0002, + "loss": 1.4036, + "step": 5850 + }, + { + "epoch": 4.918170373478809, + "grad_norm": 0.6829783916473389, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 5860 + }, + { + "epoch": 4.92656315568611, + "grad_norm": 0.784919261932373, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5870 + }, + { + "epoch": 4.934955937893411, + "grad_norm": 0.8168354034423828, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5880 + }, + { + "epoch": 4.943348720100714, + "grad_norm": 0.7356618642807007, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 5890 + }, + { + "epoch": 4.951741502308015, + "grad_norm": 0.7399224042892456, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 5900 + }, + { + "epoch": 4.960134284515317, + "grad_norm": 0.7430436015129089, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 5910 + }, + { + "epoch": 4.9685270667226185, + "grad_norm": 0.7587705850601196, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 5920 + }, + { + "epoch": 4.976919848929921, + "grad_norm": 0.9103638529777527, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5930 + }, + { + "epoch": 4.985312631137222, + "grad_norm": 0.7357394695281982, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5940 + }, + { + "epoch": 4.993705413344523, + "grad_norm": 0.7371547222137451, + "learning_rate": 0.0002, + "loss": 1.3988, + "step": 5950 + }, + { + "epoch": 4.9995803608896345, + "eval_loss": 1.9367210865020752, + "eval_runtime": 37.9833, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 5957 + }, + { + "epoch": 5.0020981955518256, + "grad_norm": 0.7783351540565491, + "learning_rate": 0.0002, + "loss": 1.3876, + "step": 5960 + }, + { + "epoch": 5.010490977759127, + "grad_norm": 0.9268898367881775, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 5970 + }, + { + "epoch": 5.018883759966429, + "grad_norm": 0.9562761783599854, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 5980 + }, + { + "epoch": 5.02727654217373, + "grad_norm": 0.9391738176345825, + "learning_rate": 0.0002, + "loss": 1.205, + "step": 5990 + }, + { + "epoch": 5.035669324381033, + "grad_norm": 0.850326418876648, + "learning_rate": 0.0002, + "loss": 1.2112, + "step": 6000 + }, + { + "epoch": 5.044062106588334, + "grad_norm": 0.8442679643630981, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 6010 + }, + { + "epoch": 5.052454888795635, + "grad_norm": 1.2147290706634521, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6020 + }, + { + "epoch": 5.0608476710029375, + "grad_norm": 0.9732922315597534, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6030 + }, + { + "epoch": 5.069240453210239, + "grad_norm": 0.9354516267776489, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6040 + }, + { + "epoch": 5.077633235417541, + "grad_norm": 0.9681560397148132, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 6050 + }, + { + "epoch": 5.086026017624842, + "grad_norm": 0.9500439763069153, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 6060 + }, + { + "epoch": 5.094418799832145, + "grad_norm": 0.8693879246711731, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6070 + }, + { + "epoch": 5.102811582039446, + "grad_norm": 1.1066458225250244, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 6080 + }, + { + "epoch": 5.111204364246748, + "grad_norm": 0.9530285000801086, + "learning_rate": 0.0002, + "loss": 1.2135, + "step": 6090 + }, + { + "epoch": 5.1195971464540495, + "grad_norm": 0.9323630928993225, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 6100 + }, + { + "epoch": 5.127989928661351, + "grad_norm": 0.9040294885635376, + "learning_rate": 0.0002, + "loss": 1.2434, + "step": 6110 + }, + { + "epoch": 5.136382710868653, + "grad_norm": 0.9981122612953186, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 6120 + }, + { + "epoch": 5.144775493075954, + "grad_norm": 0.9070921540260315, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6130 + }, + { + "epoch": 5.153168275283257, + "grad_norm": 1.043802261352539, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 6140 + }, + { + "epoch": 5.161561057490558, + "grad_norm": 1.0889761447906494, + "learning_rate": 0.0002, + "loss": 1.1865, + "step": 6150 + }, + { + "epoch": 5.16995383969786, + "grad_norm": 0.9908999800682068, + "learning_rate": 0.0002, + "loss": 1.2498, + "step": 6160 + }, + { + "epoch": 5.1783466219051615, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 6170 + }, + { + "epoch": 5.186739404112464, + "grad_norm": 0.9536478519439697, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 6180 + }, + { + "epoch": 5.195132186319765, + "grad_norm": 0.8672952055931091, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 6190 + }, + { + "epoch": 5.203524968527066, + "grad_norm": 1.0116329193115234, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 6200 + }, + { + "epoch": 5.211917750734369, + "grad_norm": 0.9327153563499451, + "learning_rate": 0.0002, + "loss": 1.1813, + "step": 6210 + }, + { + "epoch": 5.22031053294167, + "grad_norm": 0.85637366771698, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 6220 + }, + { + "epoch": 5.228703315148972, + "grad_norm": 1.0490736961364746, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6230 + }, + { + "epoch": 5.2370960973562735, + "grad_norm": 0.8849565982818604, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 6240 + }, + { + "epoch": 5.245488879563576, + "grad_norm": 0.8852671980857849, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 6250 + }, + { + "epoch": 5.253881661770877, + "grad_norm": 0.9146860241889954, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 6260 + }, + { + "epoch": 5.262274443978178, + "grad_norm": 1.0188325643539429, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 6270 + }, + { + "epoch": 5.270667226185481, + "grad_norm": 1.0053156614303589, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 6280 + }, + { + "epoch": 5.279060008392782, + "grad_norm": 0.9962273836135864, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 6290 + }, + { + "epoch": 5.287452790600084, + "grad_norm": 1.000300645828247, + "learning_rate": 0.0002, + "loss": 1.2487, + "step": 6300 + }, + { + "epoch": 5.295845572807385, + "grad_norm": 0.9821932911872864, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 6310 + }, + { + "epoch": 5.304238355014688, + "grad_norm": 1.0103896856307983, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 6320 + }, + { + "epoch": 5.312631137221989, + "grad_norm": 0.9323601722717285, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 6330 + }, + { + "epoch": 5.321023919429291, + "grad_norm": 1.0668879747390747, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 6340 + }, + { + "epoch": 5.3294167016365925, + "grad_norm": 0.9666323065757751, + "learning_rate": 0.0002, + "loss": 1.2411, + "step": 6350 + }, + { + "epoch": 5.337809483843894, + "grad_norm": 0.9439574480056763, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 6360 + }, + { + "epoch": 5.346202266051196, + "grad_norm": 1.0229361057281494, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 6370 + }, + { + "epoch": 5.354595048258497, + "grad_norm": 0.8522404432296753, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 6380 + }, + { + "epoch": 5.3629878304658, + "grad_norm": 1.3732287883758545, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 6390 + }, + { + "epoch": 5.371380612673101, + "grad_norm": 0.8201091885566711, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 6400 + }, + { + "epoch": 5.379773394880403, + "grad_norm": 0.8874436616897583, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6410 + }, + { + "epoch": 5.3881661770877045, + "grad_norm": 1.0118640661239624, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6420 + }, + { + "epoch": 5.396558959295007, + "grad_norm": 1.0468370914459229, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6430 + }, + { + "epoch": 5.404951741502308, + "grad_norm": 0.941806972026825, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 6440 + }, + { + "epoch": 5.413344523709609, + "grad_norm": 0.9860424399375916, + "learning_rate": 0.0002, + "loss": 1.3289, + "step": 6450 + }, + { + "epoch": 5.421737305916912, + "grad_norm": 1.009628176689148, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 6460 + }, + { + "epoch": 5.430130088124213, + "grad_norm": 0.9842159748077393, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6470 + }, + { + "epoch": 5.438522870331515, + "grad_norm": 0.9935571551322937, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 6480 + }, + { + "epoch": 5.4469156525388165, + "grad_norm": 0.8872362971305847, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6490 + }, + { + "epoch": 5.455308434746119, + "grad_norm": 0.9530836939811707, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 6500 + }, + { + "epoch": 5.46370121695342, + "grad_norm": 0.8111279010772705, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 6510 + }, + { + "epoch": 5.472093999160721, + "grad_norm": 1.0474516153335571, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 6520 + }, + { + "epoch": 5.480486781368024, + "grad_norm": 1.0228482484817505, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 6530 + }, + { + "epoch": 5.488879563575325, + "grad_norm": 1.0299347639083862, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 6540 + }, + { + "epoch": 5.497272345782627, + "grad_norm": 0.9105098247528076, + "learning_rate": 0.0002, + "loss": 1.3163, + "step": 6550 + }, + { + "epoch": 5.5056651279899285, + "grad_norm": 1.2459523677825928, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 6560 + }, + { + "epoch": 5.514057910197231, + "grad_norm": 1.0630481243133545, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6570 + }, + { + "epoch": 5.522450692404532, + "grad_norm": 0.8310980796813965, + "learning_rate": 0.0002, + "loss": 1.3003, + "step": 6580 + }, + { + "epoch": 5.530843474611833, + "grad_norm": 1.102723479270935, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 6590 + }, + { + "epoch": 5.539236256819136, + "grad_norm": 0.9586807489395142, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 6600 + }, + { + "epoch": 5.547629039026437, + "grad_norm": 0.976191520690918, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 6610 + }, + { + "epoch": 5.556021821233739, + "grad_norm": 0.9943762421607971, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 6620 + }, + { + "epoch": 5.56441460344104, + "grad_norm": 0.8788089156150818, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 6630 + }, + { + "epoch": 5.572807385648343, + "grad_norm": 0.9866173267364502, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6640 + }, + { + "epoch": 5.581200167855644, + "grad_norm": 1.0791642665863037, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6650 + }, + { + "epoch": 5.589592950062946, + "grad_norm": 0.836482584476471, + "learning_rate": 0.0002, + "loss": 1.2806, + "step": 6660 + }, + { + "epoch": 5.5979857322702475, + "grad_norm": 0.9841130971908569, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 6670 + }, + { + "epoch": 5.60637851447755, + "grad_norm": 0.9678813815116882, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 6680 + }, + { + "epoch": 5.614771296684851, + "grad_norm": 0.9033233523368835, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 6690 + }, + { + "epoch": 5.623164078892152, + "grad_norm": 0.8691515922546387, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6700 + }, + { + "epoch": 5.631556861099455, + "grad_norm": 0.8971360921859741, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6710 + }, + { + "epoch": 5.639949643306756, + "grad_norm": 0.9377756118774414, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 6720 + }, + { + "epoch": 5.648342425514058, + "grad_norm": 0.908762514591217, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 6730 + }, + { + "epoch": 5.6567352077213595, + "grad_norm": 1.0503337383270264, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 6740 + }, + { + "epoch": 5.665127989928662, + "grad_norm": 1.030267357826233, + "learning_rate": 0.0002, + "loss": 1.3604, + "step": 6750 + }, + { + "epoch": 5.673520772135963, + "grad_norm": 0.9150485992431641, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 6760 + }, + { + "epoch": 5.681913554343264, + "grad_norm": 1.0300343036651611, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6770 + }, + { + "epoch": 5.690306336550567, + "grad_norm": 1.1242924928665161, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 6780 + }, + { + "epoch": 5.698699118757868, + "grad_norm": 0.9489498138427734, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6790 + }, + { + "epoch": 5.70709190096517, + "grad_norm": 0.8829707503318787, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 6800 + }, + { + "epoch": 5.7154846831724715, + "grad_norm": 1.01392662525177, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 6810 + }, + { + "epoch": 5.723877465379774, + "grad_norm": 0.9234510064125061, + "learning_rate": 0.0002, + "loss": 1.3029, + "step": 6820 + }, + { + "epoch": 5.732270247587075, + "grad_norm": 0.9439187049865723, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 6830 + }, + { + "epoch": 5.740663029794376, + "grad_norm": 0.8833441734313965, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6840 + }, + { + "epoch": 5.749055812001679, + "grad_norm": 0.9394439458847046, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 6850 + }, + { + "epoch": 5.75744859420898, + "grad_norm": 0.9980010390281677, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 6860 + }, + { + "epoch": 5.765841376416282, + "grad_norm": 0.9612377882003784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 6870 + }, + { + "epoch": 5.7742341586235835, + "grad_norm": 1.0817323923110962, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6880 + }, + { + "epoch": 5.782626940830886, + "grad_norm": 0.8445103168487549, + "learning_rate": 0.0002, + "loss": 1.2485, + "step": 6890 + }, + { + "epoch": 5.791019723038187, + "grad_norm": 0.8535459041595459, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6900 + }, + { + "epoch": 5.799412505245489, + "grad_norm": 0.9131284356117249, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 6910 + }, + { + "epoch": 5.807805287452791, + "grad_norm": 0.8627726435661316, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6920 + }, + { + "epoch": 5.816198069660093, + "grad_norm": 0.8599951863288879, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 6930 + }, + { + "epoch": 5.824590851867394, + "grad_norm": 1.0746861696243286, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 6940 + }, + { + "epoch": 5.8329836340746954, + "grad_norm": 1.0220543146133423, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 6950 + }, + { + "epoch": 5.841376416281998, + "grad_norm": 0.8891388177871704, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 6960 + }, + { + "epoch": 5.849769198489299, + "grad_norm": 1.1404683589935303, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6970 + }, + { + "epoch": 5.858161980696601, + "grad_norm": 0.9665380120277405, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 6980 + }, + { + "epoch": 5.8665547629039025, + "grad_norm": 0.9837968945503235, + "learning_rate": 0.0002, + "loss": 1.2622, + "step": 6990 + }, + { + "epoch": 5.874947545111205, + "grad_norm": 1.0278598070144653, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 7000 + }, + { + "epoch": 5.883340327318506, + "grad_norm": 0.9990253448486328, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7010 + }, + { + "epoch": 5.891733109525807, + "grad_norm": 0.9705647230148315, + "learning_rate": 0.0002, + "loss": 1.3508, + "step": 7020 + }, + { + "epoch": 5.90012589173311, + "grad_norm": 0.9672252535820007, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 7030 + }, + { + "epoch": 5.908518673940411, + "grad_norm": 0.9467034339904785, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 7040 + }, + { + "epoch": 5.916911456147713, + "grad_norm": 0.9506469964981079, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 7050 + }, + { + "epoch": 5.9253042383550145, + "grad_norm": 0.8936163783073425, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 7060 + }, + { + "epoch": 5.933697020562317, + "grad_norm": 0.956101655960083, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 7070 + }, + { + "epoch": 5.942089802769618, + "grad_norm": 0.893535852432251, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 7080 + }, + { + "epoch": 5.950482584976919, + "grad_norm": 1.0313799381256104, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 7090 + }, + { + "epoch": 5.958875367184222, + "grad_norm": 0.8567915558815002, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 7100 + }, + { + "epoch": 5.967268149391523, + "grad_norm": 0.9683501720428467, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 7110 + }, + { + "epoch": 5.975660931598825, + "grad_norm": 0.9401984214782715, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 7120 + }, + { + "epoch": 5.9840537138061265, + "grad_norm": 1.0316764116287231, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 7130 + }, + { + "epoch": 5.992446496013429, + "grad_norm": 0.9335392713546753, + "learning_rate": 0.0002, + "loss": 1.2445, + "step": 7140 + }, + { + "epoch": 6.0, + "eval_loss": 2.041194438934326, + "eval_runtime": 37.9642, + "eval_samples_per_second": 13.565, + "eval_steps_per_second": 1.712, + "step": 7149 + }, + { + "epoch": 6.00083927822073, + "grad_norm": 1.0247591733932495, + "learning_rate": 0.0002, + "loss": 1.2531, + "step": 7150 + }, + { + "epoch": 6.009232060428032, + "grad_norm": 1.4086190462112427, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 7160 + }, + { + "epoch": 6.017624842635334, + "grad_norm": 1.0636897087097168, + "learning_rate": 0.0002, + "loss": 1.0702, + "step": 7170 + }, + { + "epoch": 6.026017624842635, + "grad_norm": 1.1334257125854492, + "learning_rate": 0.0002, + "loss": 1.118, + "step": 7180 + }, + { + "epoch": 6.034410407049937, + "grad_norm": 1.1142425537109375, + "learning_rate": 0.0002, + "loss": 1.0428, + "step": 7190 + }, + { + "epoch": 6.0428031892572385, + "grad_norm": 1.1448479890823364, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 7200 + }, + { + "epoch": 6.051195971464541, + "grad_norm": 1.181567907333374, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 7210 + }, + { + "epoch": 6.059588753671842, + "grad_norm": 1.0471529960632324, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 7220 + }, + { + "epoch": 6.067981535879144, + "grad_norm": 1.1432698965072632, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7230 + }, + { + "epoch": 6.076374318086446, + "grad_norm": 1.1316763162612915, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 7240 + }, + { + "epoch": 6.084767100293748, + "grad_norm": 0.9800271391868591, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7250 + }, + { + "epoch": 6.093159882501049, + "grad_norm": 1.1878576278686523, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7260 + }, + { + "epoch": 6.1015526647083504, + "grad_norm": 1.0174267292022705, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 7270 + }, + { + "epoch": 6.109945446915653, + "grad_norm": 0.9622059464454651, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 7280 + }, + { + "epoch": 6.118338229122954, + "grad_norm": 1.3247325420379639, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 7290 + }, + { + "epoch": 6.126731011330256, + "grad_norm": 1.2405189275741577, + "learning_rate": 0.0002, + "loss": 1.0533, + "step": 7300 + }, + { + "epoch": 6.1351237935375575, + "grad_norm": 1.025123953819275, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 7310 + }, + { + "epoch": 6.14351657574486, + "grad_norm": 1.2966125011444092, + "learning_rate": 0.0002, + "loss": 1.0879, + "step": 7320 + }, + { + "epoch": 6.151909357952161, + "grad_norm": 1.0655252933502197, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 7330 + }, + { + "epoch": 6.160302140159462, + "grad_norm": 1.076251745223999, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 7340 + }, + { + "epoch": 6.168694922366765, + "grad_norm": 1.0632140636444092, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 7350 + }, + { + "epoch": 6.177087704574066, + "grad_norm": 1.392654538154602, + "learning_rate": 0.0002, + "loss": 1.1284, + "step": 7360 + }, + { + "epoch": 6.185480486781368, + "grad_norm": 1.071683645248413, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 7370 + }, + { + "epoch": 6.1938732689886695, + "grad_norm": 1.0602295398712158, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 7380 + }, + { + "epoch": 6.202266051195972, + "grad_norm": 1.2152365446090698, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 7390 + }, + { + "epoch": 6.210658833403273, + "grad_norm": 1.1637049913406372, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 7400 + }, + { + "epoch": 6.219051615610575, + "grad_norm": 1.3976062536239624, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 7410 + }, + { + "epoch": 6.227444397817877, + "grad_norm": 1.1892462968826294, + "learning_rate": 0.0002, + "loss": 1.084, + "step": 7420 + }, + { + "epoch": 6.235837180025178, + "grad_norm": 1.23629629611969, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 7430 + }, + { + "epoch": 6.24422996223248, + "grad_norm": 1.2072324752807617, + "learning_rate": 0.0002, + "loss": 1.1069, + "step": 7440 + }, + { + "epoch": 6.2526227444397815, + "grad_norm": 1.2027140855789185, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 7450 + }, + { + "epoch": 6.261015526647084, + "grad_norm": 1.2129466533660889, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 7460 + }, + { + "epoch": 6.269408308854385, + "grad_norm": 1.1675773859024048, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 7470 + }, + { + "epoch": 6.277801091061687, + "grad_norm": 1.189106822013855, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 7480 + }, + { + "epoch": 6.286193873268989, + "grad_norm": 0.9968156218528748, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 7490 + }, + { + "epoch": 6.294586655476291, + "grad_norm": 1.2140403985977173, + "learning_rate": 0.0002, + "loss": 1.1816, + "step": 7500 + }, + { + "epoch": 6.302979437683592, + "grad_norm": 1.1790717840194702, + "learning_rate": 0.0002, + "loss": 1.1163, + "step": 7510 + }, + { + "epoch": 6.3113722198908935, + "grad_norm": 1.1867438554763794, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 7520 + }, + { + "epoch": 6.319765002098196, + "grad_norm": 1.2212399244308472, + "learning_rate": 0.0002, + "loss": 1.1697, + "step": 7530 + }, + { + "epoch": 6.328157784305497, + "grad_norm": 1.1840152740478516, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 7540 + }, + { + "epoch": 6.336550566512799, + "grad_norm": 1.1392520666122437, + "learning_rate": 0.0002, + "loss": 1.015, + "step": 7550 + }, + { + "epoch": 6.344943348720101, + "grad_norm": 1.2683428525924683, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 7560 + }, + { + "epoch": 6.353336130927403, + "grad_norm": 1.2927075624465942, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 7570 + }, + { + "epoch": 6.361728913134704, + "grad_norm": 1.1633557081222534, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 7580 + }, + { + "epoch": 6.3701216953420055, + "grad_norm": 1.2839789390563965, + "learning_rate": 0.0002, + "loss": 1.0448, + "step": 7590 + }, + { + "epoch": 6.378514477549308, + "grad_norm": 1.1563365459442139, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 7600 + }, + { + "epoch": 6.386907259756609, + "grad_norm": 1.3075823783874512, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 7610 + }, + { + "epoch": 6.395300041963911, + "grad_norm": 1.1148593425750732, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 7620 + }, + { + "epoch": 6.4036928241712125, + "grad_norm": 1.3017758131027222, + "learning_rate": 0.0002, + "loss": 1.1296, + "step": 7630 + }, + { + "epoch": 6.412085606378515, + "grad_norm": 1.3302847146987915, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 7640 + }, + { + "epoch": 6.420478388585816, + "grad_norm": 1.3263767957687378, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 7650 + }, + { + "epoch": 6.428871170793118, + "grad_norm": 1.2079416513442993, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7660 + }, + { + "epoch": 6.43726395300042, + "grad_norm": 1.1282644271850586, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7670 + }, + { + "epoch": 6.445656735207721, + "grad_norm": 1.1894482374191284, + "learning_rate": 0.0002, + "loss": 1.1437, + "step": 7680 + }, + { + "epoch": 6.454049517415023, + "grad_norm": 1.2007642984390259, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 7690 + }, + { + "epoch": 6.4624422996223245, + "grad_norm": 1.3172780275344849, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 7700 + }, + { + "epoch": 6.470835081829627, + "grad_norm": 1.113945722579956, + "learning_rate": 0.0002, + "loss": 1.1477, + "step": 7710 + }, + { + "epoch": 6.479227864036928, + "grad_norm": 1.1763832569122314, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 7720 + }, + { + "epoch": 6.48762064624423, + "grad_norm": 1.196928858757019, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 7730 + }, + { + "epoch": 6.496013428451532, + "grad_norm": 1.2109456062316895, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 7740 + }, + { + "epoch": 6.504406210658834, + "grad_norm": 1.3580254316329956, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 7750 + }, + { + "epoch": 6.512798992866135, + "grad_norm": 1.0432099103927612, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 7760 + }, + { + "epoch": 6.5211917750734365, + "grad_norm": 1.0125840902328491, + "learning_rate": 0.0002, + "loss": 1.1453, + "step": 7770 + }, + { + "epoch": 6.529584557280739, + "grad_norm": 1.5847094058990479, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 7780 + }, + { + "epoch": 6.53797733948804, + "grad_norm": 1.161391258239746, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7790 + }, + { + "epoch": 6.546370121695342, + "grad_norm": 1.1106663942337036, + "learning_rate": 0.0002, + "loss": 1.1549, + "step": 7800 + }, + { + "epoch": 6.554762903902644, + "grad_norm": 1.2467689514160156, + "learning_rate": 0.0002, + "loss": 1.0584, + "step": 7810 + }, + { + "epoch": 6.563155686109946, + "grad_norm": 1.1907767057418823, + "learning_rate": 0.0002, + "loss": 1.0923, + "step": 7820 + }, + { + "epoch": 6.571548468317247, + "grad_norm": 1.1521105766296387, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 7830 + }, + { + "epoch": 6.5799412505245485, + "grad_norm": 1.2498128414154053, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 7840 + }, + { + "epoch": 6.588334032731851, + "grad_norm": 1.1506036520004272, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 7850 + }, + { + "epoch": 6.596726814939152, + "grad_norm": 1.118890404701233, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7860 + }, + { + "epoch": 6.605119597146454, + "grad_norm": 1.1001442670822144, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 7870 + }, + { + "epoch": 6.613512379353756, + "grad_norm": 1.1551518440246582, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 7880 + }, + { + "epoch": 6.621905161561058, + "grad_norm": 1.1872174739837646, + "learning_rate": 0.0002, + "loss": 1.1255, + "step": 7890 + }, + { + "epoch": 6.630297943768359, + "grad_norm": 1.1665245294570923, + "learning_rate": 0.0002, + "loss": 1.1013, + "step": 7900 + }, + { + "epoch": 6.6386907259756605, + "grad_norm": 1.1592308282852173, + "learning_rate": 0.0002, + "loss": 1.1857, + "step": 7910 + }, + { + "epoch": 6.647083508182963, + "grad_norm": 1.2712409496307373, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 7920 + }, + { + "epoch": 6.655476290390264, + "grad_norm": 1.0665934085845947, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 7930 + }, + { + "epoch": 6.663869072597566, + "grad_norm": 1.1843419075012207, + "learning_rate": 0.0002, + "loss": 1.1437, + "step": 7940 + }, + { + "epoch": 6.6722618548048676, + "grad_norm": 1.4945712089538574, + "learning_rate": 0.0002, + "loss": 1.1359, + "step": 7950 + }, + { + "epoch": 6.68065463701217, + "grad_norm": 1.3284149169921875, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 7960 + }, + { + "epoch": 6.689047419219471, + "grad_norm": 1.1670401096343994, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 7970 + }, + { + "epoch": 6.697440201426773, + "grad_norm": 1.1963475942611694, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 7980 + }, + { + "epoch": 6.705832983634075, + "grad_norm": 1.077380657196045, + "learning_rate": 0.0002, + "loss": 1.1489, + "step": 7990 + }, + { + "epoch": 6.714225765841377, + "grad_norm": 0.8758405447006226, + "learning_rate": 0.0002, + "loss": 1.1661, + "step": 8000 + }, + { + "epoch": 6.722618548048678, + "grad_norm": 1.2686632871627808, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 8010 + }, + { + "epoch": 6.7310113302559795, + "grad_norm": 1.1136665344238281, + "learning_rate": 0.0002, + "loss": 1.1486, + "step": 8020 + }, + { + "epoch": 6.739404112463282, + "grad_norm": 1.25029456615448, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 8030 + }, + { + "epoch": 6.747796894670583, + "grad_norm": 1.0269629955291748, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 8040 + }, + { + "epoch": 6.756189676877885, + "grad_norm": 1.1515758037567139, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 8050 + }, + { + "epoch": 6.764582459085187, + "grad_norm": 1.1150308847427368, + "learning_rate": 0.0002, + "loss": 1.1487, + "step": 8060 + }, + { + "epoch": 6.772975241292489, + "grad_norm": 1.025669813156128, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 8070 + }, + { + "epoch": 6.78136802349979, + "grad_norm": 1.0564825534820557, + "learning_rate": 0.0002, + "loss": 1.1002, + "step": 8080 + }, + { + "epoch": 6.7897608057070915, + "grad_norm": 1.1695157289505005, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 8090 + }, + { + "epoch": 6.798153587914394, + "grad_norm": 1.1086713075637817, + "learning_rate": 0.0002, + "loss": 1.1322, + "step": 8100 + }, + { + "epoch": 6.806546370121695, + "grad_norm": 1.0446662902832031, + "learning_rate": 0.0002, + "loss": 1.2036, + "step": 8110 + }, + { + "epoch": 6.814939152328997, + "grad_norm": 1.2017868757247925, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 8120 + }, + { + "epoch": 6.823331934536299, + "grad_norm": 1.2538378238677979, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 8130 + }, + { + "epoch": 6.831724716743601, + "grad_norm": 1.1552783250808716, + "learning_rate": 0.0002, + "loss": 1.1506, + "step": 8140 + }, + { + "epoch": 6.840117498950902, + "grad_norm": 1.2151418924331665, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 8150 + }, + { + "epoch": 6.8485102811582035, + "grad_norm": 1.1431301832199097, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 8160 + }, + { + "epoch": 6.856903063365506, + "grad_norm": 1.0864715576171875, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 8170 + }, + { + "epoch": 6.865295845572807, + "grad_norm": 1.2602605819702148, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 8180 + }, + { + "epoch": 6.873688627780109, + "grad_norm": 1.1670788526535034, + "learning_rate": 0.0002, + "loss": 1.1237, + "step": 8190 + }, + { + "epoch": 6.882081409987411, + "grad_norm": 1.1444851160049438, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 8200 + }, + { + "epoch": 6.890474192194713, + "grad_norm": 1.1726973056793213, + "learning_rate": 0.0002, + "loss": 1.1208, + "step": 8210 + }, + { + "epoch": 6.898866974402014, + "grad_norm": 1.0436229705810547, + "learning_rate": 0.0002, + "loss": 1.1666, + "step": 8220 + }, + { + "epoch": 6.907259756609316, + "grad_norm": 1.3296568393707275, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 8230 + }, + { + "epoch": 6.915652538816618, + "grad_norm": 1.2561821937561035, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 8240 + }, + { + "epoch": 6.92404532102392, + "grad_norm": 1.2071776390075684, + "learning_rate": 0.0002, + "loss": 1.2125, + "step": 8250 + }, + { + "epoch": 6.932438103231221, + "grad_norm": 1.115523099899292, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 8260 + }, + { + "epoch": 6.940830885438523, + "grad_norm": 1.145468831062317, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 8270 + }, + { + "epoch": 6.949223667645825, + "grad_norm": 1.2517759799957275, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 8280 + }, + { + "epoch": 6.957616449853126, + "grad_norm": 1.1757365465164185, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 8290 + }, + { + "epoch": 6.966009232060428, + "grad_norm": 1.0645636320114136, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 8300 + }, + { + "epoch": 6.97440201426773, + "grad_norm": 1.2390278577804565, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 8310 + }, + { + "epoch": 6.982794796475032, + "grad_norm": 1.202418327331543, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 8320 + }, + { + "epoch": 6.991187578682333, + "grad_norm": 1.0840344429016113, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 8330 + }, + { + "epoch": 6.9995803608896345, + "grad_norm": 1.2504760026931763, + "learning_rate": 0.0002, + "loss": 1.1172, + "step": 8340 + }, + { + "epoch": 6.9995803608896345, + "eval_loss": 2.1729838848114014, + "eval_runtime": 37.9703, + "eval_samples_per_second": 13.563, + "eval_steps_per_second": 1.712, + "step": 8340 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.8597966486411674e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-8340/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1835b8bf06e2070e580b58d4068615c693422108 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:010e95e4cf23809bd695d0098ca7bf2d1ada2754cd24bb22031769cebd4a1124 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..312f2f06c0356c85b412c4845c6a141a4a5cbf94 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6844a461e1ee4ea9efdc0ed69ac7a11246ac9b75775fcb8ba2959039ab09d662 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6fb7d7591ff9359f57064d79631a7276ce087657 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f897e847a9add91ca72d31af7210cf68cb737c85ef1e265b1cd547162f5c91 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f42a3b698841256e9cfebe37f1a1dd1e7807df99 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3015d21a0b628b16f9e41c830a5bb7ebe56d078c9c02fac4bd4b2d5c6901010c +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00ab3aadf7f7031e27aa37ab9edbf6d5eafa47e6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/trainer_state.json @@ -0,0 +1,6761 @@ +{ + "best_metric": 1.807437539100647, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", + "epoch": 7.996642887117079, + "eval_steps": 10, + "global_step": 9528, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00839278220730172, + "grad_norm": 0.6016407012939453, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.01678556441460344, + "grad_norm": 0.5444163084030151, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 20 + }, + { + "epoch": 0.02517834662190516, + "grad_norm": 0.5771743059158325, + "learning_rate": 0.0002, + "loss": 2.004, + "step": 30 + }, + { + "epoch": 0.03357112882920688, + "grad_norm": 0.5426492094993591, + "learning_rate": 0.0002, + "loss": 1.9819, + "step": 40 + }, + { + "epoch": 0.0419639110365086, + "grad_norm": 0.5884947180747986, + "learning_rate": 0.0002, + "loss": 2.0078, + "step": 50 + }, + { + "epoch": 0.05035669324381032, + "grad_norm": 0.47584953904151917, + "learning_rate": 0.0002, + "loss": 1.875, + "step": 60 + }, + { + "epoch": 0.058749475451112046, + "grad_norm": 0.529290497303009, + "learning_rate": 0.0002, + "loss": 1.8831, + "step": 70 + }, + { + "epoch": 0.06714225765841376, + "grad_norm": 0.48883911967277527, + "learning_rate": 0.0002, + "loss": 1.9296, + "step": 80 + }, + { + "epoch": 0.07553503986571548, + "grad_norm": 0.4272284209728241, + "learning_rate": 0.0002, + "loss": 1.8456, + "step": 90 + }, + { + "epoch": 0.0839278220730172, + "grad_norm": 0.42270252108573914, + "learning_rate": 0.0002, + "loss": 1.9089, + "step": 100 + }, + { + "epoch": 0.09232060428031892, + "grad_norm": 0.45384910702705383, + "learning_rate": 0.0002, + "loss": 1.8279, + "step": 110 + }, + { + "epoch": 0.10071338648762064, + "grad_norm": 0.37896445393562317, + "learning_rate": 0.0002, + "loss": 1.9126, + "step": 120 + }, + { + "epoch": 0.10910616869492237, + "grad_norm": 0.4134417176246643, + "learning_rate": 0.0002, + "loss": 1.8618, + "step": 130 + }, + { + "epoch": 0.11749895090222409, + "grad_norm": 0.42598405480384827, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 140 + }, + { + "epoch": 0.1258917331095258, + "grad_norm": 0.39050817489624023, + "learning_rate": 0.0002, + "loss": 1.8056, + "step": 150 + }, + { + "epoch": 0.13428451531682753, + "grad_norm": 0.3783605098724365, + "learning_rate": 0.0002, + "loss": 1.8912, + "step": 160 + }, + { + "epoch": 0.14267729752412925, + "grad_norm": 0.4229804575443268, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 170 + }, + { + "epoch": 0.15107007973143097, + "grad_norm": 0.3557824194431305, + "learning_rate": 0.0002, + "loss": 1.8183, + "step": 180 + }, + { + "epoch": 0.1594628619387327, + "grad_norm": 0.37380388379096985, + "learning_rate": 0.0002, + "loss": 1.8105, + "step": 190 + }, + { + "epoch": 0.1678556441460344, + "grad_norm": 0.3803510367870331, + "learning_rate": 0.0002, + "loss": 1.907, + "step": 200 + }, + { + "epoch": 0.17624842635333612, + "grad_norm": 0.5078789591789246, + "learning_rate": 0.0002, + "loss": 1.7942, + "step": 210 + }, + { + "epoch": 0.18464120856063784, + "grad_norm": 1.8922057151794434, + "learning_rate": 0.0002, + "loss": 1.7683, + "step": 220 + }, + { + "epoch": 0.19303399076793956, + "grad_norm": 0.36936357617378235, + "learning_rate": 0.0002, + "loss": 1.8617, + "step": 230 + }, + { + "epoch": 0.20142677297524128, + "grad_norm": 0.41423121094703674, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 240 + }, + { + "epoch": 0.209819555182543, + "grad_norm": 0.3869935870170593, + "learning_rate": 0.0002, + "loss": 1.8249, + "step": 250 + }, + { + "epoch": 0.21821233738984475, + "grad_norm": 0.35073965787887573, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 260 + }, + { + "epoch": 0.22660511959714646, + "grad_norm": 0.3748358190059662, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 270 + }, + { + "epoch": 0.23499790180444818, + "grad_norm": 0.36887043714523315, + "learning_rate": 0.0002, + "loss": 1.8534, + "step": 280 + }, + { + "epoch": 0.2433906840117499, + "grad_norm": 0.36038365960121155, + "learning_rate": 0.0002, + "loss": 1.8645, + "step": 290 + }, + { + "epoch": 0.2517834662190516, + "grad_norm": 0.36350926756858826, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 300 + }, + { + "epoch": 0.26017624842635334, + "grad_norm": 0.351936936378479, + "learning_rate": 0.0002, + "loss": 1.8339, + "step": 310 + }, + { + "epoch": 0.26856903063365506, + "grad_norm": 0.35942426323890686, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 320 + }, + { + "epoch": 0.2769618128409568, + "grad_norm": 0.39852434396743774, + "learning_rate": 0.0002, + "loss": 1.8205, + "step": 330 + }, + { + "epoch": 0.2853545950482585, + "grad_norm": 0.3282669186592102, + "learning_rate": 0.0002, + "loss": 1.8598, + "step": 340 + }, + { + "epoch": 0.2937473772555602, + "grad_norm": 0.3388650417327881, + "learning_rate": 0.0002, + "loss": 1.8164, + "step": 350 + }, + { + "epoch": 0.30214015946286193, + "grad_norm": 0.31616076827049255, + "learning_rate": 0.0002, + "loss": 1.784, + "step": 360 + }, + { + "epoch": 0.31053294167016365, + "grad_norm": 0.34184730052948, + "learning_rate": 0.0002, + "loss": 1.8365, + "step": 370 + }, + { + "epoch": 0.3189257238774654, + "grad_norm": 0.3599095344543457, + "learning_rate": 0.0002, + "loss": 1.8051, + "step": 380 + }, + { + "epoch": 0.3273185060847671, + "grad_norm": 0.3970130681991577, + "learning_rate": 0.0002, + "loss": 1.8274, + "step": 390 + }, + { + "epoch": 0.3357112882920688, + "grad_norm": 0.40854907035827637, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 400 + }, + { + "epoch": 0.34410407049937053, + "grad_norm": 0.33014851808547974, + "learning_rate": 0.0002, + "loss": 1.8403, + "step": 410 + }, + { + "epoch": 0.35249685270667225, + "grad_norm": 0.3269062042236328, + "learning_rate": 0.0002, + "loss": 1.825, + "step": 420 + }, + { + "epoch": 0.36088963491397397, + "grad_norm": 0.35455429553985596, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 430 + }, + { + "epoch": 0.3692824171212757, + "grad_norm": 0.34339913725852966, + "learning_rate": 0.0002, + "loss": 1.8299, + "step": 440 + }, + { + "epoch": 0.3776751993285774, + "grad_norm": 0.34326961636543274, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 450 + }, + { + "epoch": 0.3860679815358791, + "grad_norm": 0.33944424986839294, + "learning_rate": 0.0002, + "loss": 1.7931, + "step": 460 + }, + { + "epoch": 0.39446076374318084, + "grad_norm": 0.3673107326030731, + "learning_rate": 0.0002, + "loss": 1.8445, + "step": 470 + }, + { + "epoch": 0.40285354595048256, + "grad_norm": 0.40028971433639526, + "learning_rate": 0.0002, + "loss": 1.7105, + "step": 480 + }, + { + "epoch": 0.4112463281577843, + "grad_norm": 0.4117187261581421, + "learning_rate": 0.0002, + "loss": 1.7771, + "step": 490 + }, + { + "epoch": 0.419639110365086, + "grad_norm": 0.31541067361831665, + "learning_rate": 0.0002, + "loss": 1.768, + "step": 500 + }, + { + "epoch": 0.4280318925723878, + "grad_norm": 0.32634997367858887, + "learning_rate": 0.0002, + "loss": 1.7757, + "step": 510 + }, + { + "epoch": 0.4364246747796895, + "grad_norm": 0.3255768120288849, + "learning_rate": 0.0002, + "loss": 1.793, + "step": 520 + }, + { + "epoch": 0.4448174569869912, + "grad_norm": 0.34764620661735535, + "learning_rate": 0.0002, + "loss": 1.7375, + "step": 530 + }, + { + "epoch": 0.45321023919429293, + "grad_norm": 0.36379843950271606, + "learning_rate": 0.0002, + "loss": 1.8421, + "step": 540 + }, + { + "epoch": 0.46160302140159465, + "grad_norm": 0.37775811553001404, + "learning_rate": 0.0002, + "loss": 1.8103, + "step": 550 + }, + { + "epoch": 0.46999580360889637, + "grad_norm": 0.3421199917793274, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 560 + }, + { + "epoch": 0.4783885858161981, + "grad_norm": 0.3447427749633789, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 570 + }, + { + "epoch": 0.4867813680234998, + "grad_norm": 0.38283416628837585, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 580 + }, + { + "epoch": 0.4951741502308015, + "grad_norm": 0.34281104803085327, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 590 + }, + { + "epoch": 0.5035669324381032, + "grad_norm": 0.35317757725715637, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 600 + }, + { + "epoch": 0.5119597146454049, + "grad_norm": 0.34344494342803955, + "learning_rate": 0.0002, + "loss": 1.829, + "step": 610 + }, + { + "epoch": 0.5203524968527067, + "grad_norm": 0.3168846666812897, + "learning_rate": 0.0002, + "loss": 1.84, + "step": 620 + }, + { + "epoch": 0.5287452790600083, + "grad_norm": 0.570289671421051, + "learning_rate": 0.0002, + "loss": 1.8811, + "step": 630 + }, + { + "epoch": 0.5371380612673101, + "grad_norm": 0.32985877990722656, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 640 + }, + { + "epoch": 0.5455308434746118, + "grad_norm": 0.418250173330307, + "learning_rate": 0.0002, + "loss": 1.8455, + "step": 650 + }, + { + "epoch": 0.5539236256819136, + "grad_norm": 0.34269577264785767, + "learning_rate": 0.0002, + "loss": 1.7127, + "step": 660 + }, + { + "epoch": 0.5623164078892152, + "grad_norm": 0.6531919240951538, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 670 + }, + { + "epoch": 0.570709190096517, + "grad_norm": 0.3711959719657898, + "learning_rate": 0.0002, + "loss": 1.7499, + "step": 680 + }, + { + "epoch": 0.5791019723038188, + "grad_norm": 0.3916425108909607, + "learning_rate": 0.0002, + "loss": 1.802, + "step": 690 + }, + { + "epoch": 0.5874947545111204, + "grad_norm": 0.31316208839416504, + "learning_rate": 0.0002, + "loss": 1.8752, + "step": 700 + }, + { + "epoch": 0.5958875367184222, + "grad_norm": 0.35153743624687195, + "learning_rate": 0.0002, + "loss": 1.8222, + "step": 710 + }, + { + "epoch": 0.6042803189257239, + "grad_norm": 0.34590575098991394, + "learning_rate": 0.0002, + "loss": 1.7817, + "step": 720 + }, + { + "epoch": 0.6126731011330256, + "grad_norm": 0.2984001040458679, + "learning_rate": 0.0002, + "loss": 1.8062, + "step": 730 + }, + { + "epoch": 0.6210658833403273, + "grad_norm": 0.3588712513446808, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 740 + }, + { + "epoch": 0.6294586655476291, + "grad_norm": 0.3288203179836273, + "learning_rate": 0.0002, + "loss": 1.7652, + "step": 750 + }, + { + "epoch": 0.6378514477549307, + "grad_norm": 0.3102910816669464, + "learning_rate": 0.0002, + "loss": 1.799, + "step": 760 + }, + { + "epoch": 0.6462442299622325, + "grad_norm": 0.42002803087234497, + "learning_rate": 0.0002, + "loss": 1.8746, + "step": 770 + }, + { + "epoch": 0.6546370121695342, + "grad_norm": 0.35616543889045715, + "learning_rate": 0.0002, + "loss": 1.8726, + "step": 780 + }, + { + "epoch": 0.663029794376836, + "grad_norm": 0.37670427560806274, + "learning_rate": 0.0002, + "loss": 1.8118, + "step": 790 + }, + { + "epoch": 0.6714225765841376, + "grad_norm": 0.3410654664039612, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 800 + }, + { + "epoch": 0.6798153587914394, + "grad_norm": 0.2916128635406494, + "learning_rate": 0.0002, + "loss": 1.7782, + "step": 810 + }, + { + "epoch": 0.6882081409987411, + "grad_norm": 0.3147228956222534, + "learning_rate": 0.0002, + "loss": 1.8057, + "step": 820 + }, + { + "epoch": 0.6966009232060428, + "grad_norm": 0.3593887984752655, + "learning_rate": 0.0002, + "loss": 1.7826, + "step": 830 + }, + { + "epoch": 0.7049937054133445, + "grad_norm": 0.29242461919784546, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 840 + }, + { + "epoch": 0.7133864876206463, + "grad_norm": 0.32993558049201965, + "learning_rate": 0.0002, + "loss": 1.8083, + "step": 850 + }, + { + "epoch": 0.7217792698279479, + "grad_norm": 0.3939134478569031, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 860 + }, + { + "epoch": 0.7301720520352497, + "grad_norm": 0.3476874828338623, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 870 + }, + { + "epoch": 0.7385648342425514, + "grad_norm": 0.324367880821228, + "learning_rate": 0.0002, + "loss": 1.8127, + "step": 880 + }, + { + "epoch": 0.7469576164498531, + "grad_norm": 0.29460495710372925, + "learning_rate": 0.0002, + "loss": 1.7533, + "step": 890 + }, + { + "epoch": 0.7553503986571548, + "grad_norm": 0.37918367981910706, + "learning_rate": 0.0002, + "loss": 1.7544, + "step": 900 + }, + { + "epoch": 0.7637431808644566, + "grad_norm": 0.3517799973487854, + "learning_rate": 0.0002, + "loss": 1.7579, + "step": 910 + }, + { + "epoch": 0.7721359630717582, + "grad_norm": 0.3069603443145752, + "learning_rate": 0.0002, + "loss": 1.7895, + "step": 920 + }, + { + "epoch": 0.78052874527906, + "grad_norm": 0.3776717483997345, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 930 + }, + { + "epoch": 0.7889215274863617, + "grad_norm": 0.4474868178367615, + "learning_rate": 0.0002, + "loss": 1.8663, + "step": 940 + }, + { + "epoch": 0.7973143096936635, + "grad_norm": 0.3259398639202118, + "learning_rate": 0.0002, + "loss": 1.7976, + "step": 950 + }, + { + "epoch": 0.8057070919009651, + "grad_norm": 0.3109343647956848, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 960 + }, + { + "epoch": 0.8140998741082669, + "grad_norm": 0.3707215189933777, + "learning_rate": 0.0002, + "loss": 1.8035, + "step": 970 + }, + { + "epoch": 0.8224926563155686, + "grad_norm": 0.3671801686286926, + "learning_rate": 0.0002, + "loss": 1.851, + "step": 980 + }, + { + "epoch": 0.8308854385228703, + "grad_norm": 0.3278632164001465, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 990 + }, + { + "epoch": 0.839278220730172, + "grad_norm": 0.32587629556655884, + "learning_rate": 0.0002, + "loss": 1.7679, + "step": 1000 + }, + { + "epoch": 0.8476710029374738, + "grad_norm": 0.3705422878265381, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1010 + }, + { + "epoch": 0.8560637851447755, + "grad_norm": 0.43461498618125916, + "learning_rate": 0.0002, + "loss": 1.7723, + "step": 1020 + }, + { + "epoch": 0.8644565673520772, + "grad_norm": 0.30326616764068604, + "learning_rate": 0.0002, + "loss": 1.7528, + "step": 1030 + }, + { + "epoch": 0.872849349559379, + "grad_norm": 0.3383970260620117, + "learning_rate": 0.0002, + "loss": 1.7688, + "step": 1040 + }, + { + "epoch": 0.8812421317666806, + "grad_norm": 0.3041667640209198, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1050 + }, + { + "epoch": 0.8896349139739824, + "grad_norm": 0.4173165261745453, + "learning_rate": 0.0002, + "loss": 1.8515, + "step": 1060 + }, + { + "epoch": 0.8980276961812841, + "grad_norm": 0.394760400056839, + "learning_rate": 0.0002, + "loss": 1.8217, + "step": 1070 + }, + { + "epoch": 0.9064204783885859, + "grad_norm": 0.32503336668014526, + "learning_rate": 0.0002, + "loss": 1.7425, + "step": 1080 + }, + { + "epoch": 0.9148132605958875, + "grad_norm": 0.339996337890625, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 1090 + }, + { + "epoch": 0.9232060428031893, + "grad_norm": 0.3512224555015564, + "learning_rate": 0.0002, + "loss": 1.7893, + "step": 1100 + }, + { + "epoch": 0.931598825010491, + "grad_norm": 0.458159863948822, + "learning_rate": 0.0002, + "loss": 1.8027, + "step": 1110 + }, + { + "epoch": 0.9399916072177927, + "grad_norm": 0.3467862904071808, + "learning_rate": 0.0002, + "loss": 1.7974, + "step": 1120 + }, + { + "epoch": 0.9483843894250944, + "grad_norm": 0.3274364173412323, + "learning_rate": 0.0002, + "loss": 1.836, + "step": 1130 + }, + { + "epoch": 0.9567771716323962, + "grad_norm": 0.3269580006599426, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 1140 + }, + { + "epoch": 0.9651699538396978, + "grad_norm": 0.31564876437187195, + "learning_rate": 0.0002, + "loss": 1.8383, + "step": 1150 + }, + { + "epoch": 0.9735627360469996, + "grad_norm": 0.32907289266586304, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1160 + }, + { + "epoch": 0.9819555182543013, + "grad_norm": 0.3564138412475586, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1170 + }, + { + "epoch": 0.990348300461603, + "grad_norm": 0.32875651121139526, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1180 + }, + { + "epoch": 0.9987410826689047, + "grad_norm": 0.3225541114807129, + "learning_rate": 0.0002, + "loss": 1.7232, + "step": 1190 + }, + { + "epoch": 0.9995803608896349, + "eval_loss": 1.8086129426956177, + "eval_runtime": 38.0431, + "eval_samples_per_second": 13.537, + "eval_steps_per_second": 1.709, + "step": 1191 + }, + { + "epoch": 1.0071338648762065, + "grad_norm": 0.3235187232494354, + "learning_rate": 0.0002, + "loss": 1.6856, + "step": 1200 + }, + { + "epoch": 1.0155266470835083, + "grad_norm": 0.34884774684906006, + "learning_rate": 0.0002, + "loss": 1.7121, + "step": 1210 + }, + { + "epoch": 1.0239194292908098, + "grad_norm": 0.3215438425540924, + "learning_rate": 0.0002, + "loss": 1.6779, + "step": 1220 + }, + { + "epoch": 1.0323122114981116, + "grad_norm": 0.312084823846817, + "learning_rate": 0.0002, + "loss": 1.6562, + "step": 1230 + }, + { + "epoch": 1.0407049937054134, + "grad_norm": 0.33597758412361145, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 1240 + }, + { + "epoch": 1.0490977759127151, + "grad_norm": 0.3421499729156494, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1250 + }, + { + "epoch": 1.0574905581200167, + "grad_norm": 0.3458889126777649, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1260 + }, + { + "epoch": 1.0658833403273185, + "grad_norm": 0.3956579864025116, + "learning_rate": 0.0002, + "loss": 1.6929, + "step": 1270 + }, + { + "epoch": 1.0742761225346202, + "grad_norm": 0.3217819035053253, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 1280 + }, + { + "epoch": 1.082668904741922, + "grad_norm": 0.31379663944244385, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1290 + }, + { + "epoch": 1.0910616869492236, + "grad_norm": 0.37231558561325073, + "learning_rate": 0.0002, + "loss": 1.6331, + "step": 1300 + }, + { + "epoch": 1.0994544691565253, + "grad_norm": 0.35857918858528137, + "learning_rate": 0.0002, + "loss": 1.6614, + "step": 1310 + }, + { + "epoch": 1.1078472513638271, + "grad_norm": 0.36637991666793823, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 1320 + }, + { + "epoch": 1.1162400335711289, + "grad_norm": 0.3436494469642639, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 1330 + }, + { + "epoch": 1.1246328157784307, + "grad_norm": 0.404908150434494, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 1340 + }, + { + "epoch": 1.1330255979857322, + "grad_norm": 0.34587544202804565, + "learning_rate": 0.0002, + "loss": 1.7042, + "step": 1350 + }, + { + "epoch": 1.141418380193034, + "grad_norm": 0.35142362117767334, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 1360 + }, + { + "epoch": 1.1498111624003358, + "grad_norm": 0.3511804938316345, + "learning_rate": 0.0002, + "loss": 1.6781, + "step": 1370 + }, + { + "epoch": 1.1582039446076373, + "grad_norm": 0.3549560308456421, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 1380 + }, + { + "epoch": 1.166596726814939, + "grad_norm": 0.35797521471977234, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 1390 + }, + { + "epoch": 1.1749895090222409, + "grad_norm": 0.37255269289016724, + "learning_rate": 0.0002, + "loss": 1.7476, + "step": 1400 + }, + { + "epoch": 1.1833822912295426, + "grad_norm": 0.3680652379989624, + "learning_rate": 0.0002, + "loss": 1.7274, + "step": 1410 + }, + { + "epoch": 1.1917750734368444, + "grad_norm": 0.400831013917923, + "learning_rate": 0.0002, + "loss": 1.6751, + "step": 1420 + }, + { + "epoch": 1.200167855644146, + "grad_norm": 0.39571020007133484, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1430 + }, + { + "epoch": 1.2085606378514477, + "grad_norm": 0.3843863010406494, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 1440 + }, + { + "epoch": 1.2169534200587495, + "grad_norm": 0.3901960551738739, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1450 + }, + { + "epoch": 1.2253462022660513, + "grad_norm": 0.36490726470947266, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 1460 + }, + { + "epoch": 1.2337389844733528, + "grad_norm": 0.3739864230155945, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1470 + }, + { + "epoch": 1.2421317666806546, + "grad_norm": 0.39061254262924194, + "learning_rate": 0.0002, + "loss": 1.6795, + "step": 1480 + }, + { + "epoch": 1.2505245488879564, + "grad_norm": 0.37198659777641296, + "learning_rate": 0.0002, + "loss": 1.6838, + "step": 1490 + }, + { + "epoch": 1.2589173310952582, + "grad_norm": 0.3420586884021759, + "learning_rate": 0.0002, + "loss": 1.725, + "step": 1500 + }, + { + "epoch": 1.2673101133025597, + "grad_norm": 0.4094347655773163, + "learning_rate": 0.0002, + "loss": 1.719, + "step": 1510 + }, + { + "epoch": 1.2757028955098615, + "grad_norm": 0.38997703790664673, + "learning_rate": 0.0002, + "loss": 1.7563, + "step": 1520 + }, + { + "epoch": 1.2840956777171633, + "grad_norm": 0.35702022910118103, + "learning_rate": 0.0002, + "loss": 1.6651, + "step": 1530 + }, + { + "epoch": 1.292488459924465, + "grad_norm": 0.3892163336277008, + "learning_rate": 0.0002, + "loss": 1.6689, + "step": 1540 + }, + { + "epoch": 1.3008812421317666, + "grad_norm": 0.33174318075180054, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 1550 + }, + { + "epoch": 1.3092740243390684, + "grad_norm": 0.40701809525489807, + "learning_rate": 0.0002, + "loss": 1.7581, + "step": 1560 + }, + { + "epoch": 1.3176668065463701, + "grad_norm": 0.36324232816696167, + "learning_rate": 0.0002, + "loss": 1.7229, + "step": 1570 + }, + { + "epoch": 1.326059588753672, + "grad_norm": 0.3748789429664612, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 1580 + }, + { + "epoch": 1.3344523709609737, + "grad_norm": 0.40873438119888306, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 1590 + }, + { + "epoch": 1.3428451531682752, + "grad_norm": 0.52373206615448, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1600 + }, + { + "epoch": 1.351237935375577, + "grad_norm": 0.40408164262771606, + "learning_rate": 0.0002, + "loss": 1.7593, + "step": 1610 + }, + { + "epoch": 1.3596307175828788, + "grad_norm": 0.3818126320838928, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 1.3680234997901803, + "grad_norm": 0.3457068204879761, + "learning_rate": 0.0002, + "loss": 1.6328, + "step": 1630 + }, + { + "epoch": 1.3764162819974821, + "grad_norm": 0.33777865767478943, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 1640 + }, + { + "epoch": 1.384809064204784, + "grad_norm": 0.36344218254089355, + "learning_rate": 0.0002, + "loss": 1.7335, + "step": 1650 + }, + { + "epoch": 1.3932018464120857, + "grad_norm": 0.3880128562450409, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 1660 + }, + { + "epoch": 1.4015946286193874, + "grad_norm": 0.3906225562095642, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1670 + }, + { + "epoch": 1.409987410826689, + "grad_norm": 0.35857489705085754, + "learning_rate": 0.0002, + "loss": 1.7041, + "step": 1680 + }, + { + "epoch": 1.4183801930339908, + "grad_norm": 0.3627418279647827, + "learning_rate": 0.0002, + "loss": 1.7175, + "step": 1690 + }, + { + "epoch": 1.4267729752412925, + "grad_norm": 0.41963326930999756, + "learning_rate": 0.0002, + "loss": 1.6948, + "step": 1700 + }, + { + "epoch": 1.435165757448594, + "grad_norm": 0.36280378699302673, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 1.4435585396558959, + "grad_norm": 0.3868233561515808, + "learning_rate": 0.0002, + "loss": 1.7775, + "step": 1720 + }, + { + "epoch": 1.4519513218631976, + "grad_norm": 0.3635849356651306, + "learning_rate": 0.0002, + "loss": 1.6963, + "step": 1730 + }, + { + "epoch": 1.4603441040704994, + "grad_norm": 0.4885194003582001, + "learning_rate": 0.0002, + "loss": 1.7381, + "step": 1740 + }, + { + "epoch": 1.4687368862778012, + "grad_norm": 0.35194680094718933, + "learning_rate": 0.0002, + "loss": 1.6661, + "step": 1750 + }, + { + "epoch": 1.4771296684851027, + "grad_norm": 0.34906691312789917, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 1760 + }, + { + "epoch": 1.4855224506924045, + "grad_norm": 0.3994184732437134, + "learning_rate": 0.0002, + "loss": 1.7196, + "step": 1770 + }, + { + "epoch": 1.4939152328997063, + "grad_norm": 0.3599298298358917, + "learning_rate": 0.0002, + "loss": 1.7157, + "step": 1780 + }, + { + "epoch": 1.5023080151070078, + "grad_norm": 0.3794984221458435, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 1790 + }, + { + "epoch": 1.5107007973143096, + "grad_norm": 0.36289724707603455, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 1800 + }, + { + "epoch": 1.5190935795216114, + "grad_norm": 0.38057321310043335, + "learning_rate": 0.0002, + "loss": 1.78, + "step": 1810 + }, + { + "epoch": 1.5274863617289132, + "grad_norm": 0.3771969676017761, + "learning_rate": 0.0002, + "loss": 1.7006, + "step": 1820 + }, + { + "epoch": 1.535879143936215, + "grad_norm": 0.34788841009140015, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 1830 + }, + { + "epoch": 1.5442719261435167, + "grad_norm": 0.41352227330207825, + "learning_rate": 0.0002, + "loss": 1.7148, + "step": 1840 + }, + { + "epoch": 1.5526647083508183, + "grad_norm": 0.35711410641670227, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1850 + }, + { + "epoch": 1.56105749055812, + "grad_norm": 0.40607622265815735, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1860 + }, + { + "epoch": 1.5694502727654216, + "grad_norm": 0.3428550660610199, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 1870 + }, + { + "epoch": 1.5778430549727234, + "grad_norm": 0.3695414066314697, + "learning_rate": 0.0002, + "loss": 1.7909, + "step": 1880 + }, + { + "epoch": 1.5862358371800251, + "grad_norm": 0.3798272907733917, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1890 + }, + { + "epoch": 1.594628619387327, + "grad_norm": 0.3415829837322235, + "learning_rate": 0.0002, + "loss": 1.7412, + "step": 1900 + }, + { + "epoch": 1.6030214015946287, + "grad_norm": 0.3575693666934967, + "learning_rate": 0.0002, + "loss": 1.8233, + "step": 1910 + }, + { + "epoch": 1.6114141838019305, + "grad_norm": 0.3180370628833771, + "learning_rate": 0.0002, + "loss": 1.6947, + "step": 1920 + }, + { + "epoch": 1.619806966009232, + "grad_norm": 0.5018689036369324, + "learning_rate": 0.0002, + "loss": 1.7506, + "step": 1930 + }, + { + "epoch": 1.6281997482165338, + "grad_norm": 0.35676372051239014, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1940 + }, + { + "epoch": 1.6365925304238353, + "grad_norm": 0.3740452229976654, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 1950 + }, + { + "epoch": 1.6449853126311371, + "grad_norm": 0.36584731936454773, + "learning_rate": 0.0002, + "loss": 1.6474, + "step": 1960 + }, + { + "epoch": 1.653378094838439, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.0002, + "loss": 1.7306, + "step": 1970 + }, + { + "epoch": 1.6617708770457407, + "grad_norm": 0.4114968776702881, + "learning_rate": 0.0002, + "loss": 1.7694, + "step": 1980 + }, + { + "epoch": 1.6701636592530424, + "grad_norm": 0.3665498197078705, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 1990 + }, + { + "epoch": 1.6785564414603442, + "grad_norm": 0.36579379439353943, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 2000 + }, + { + "epoch": 1.6869492236676458, + "grad_norm": 0.3813064694404602, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 2010 + }, + { + "epoch": 1.6953420058749475, + "grad_norm": 0.33390694856643677, + "learning_rate": 0.0002, + "loss": 1.7566, + "step": 2020 + }, + { + "epoch": 1.7037347880822493, + "grad_norm": 0.3668614327907562, + "learning_rate": 0.0002, + "loss": 1.6576, + "step": 2030 + }, + { + "epoch": 1.7121275702895509, + "grad_norm": 0.352028489112854, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2040 + }, + { + "epoch": 1.7205203524968526, + "grad_norm": 0.33639830350875854, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2050 + }, + { + "epoch": 1.7289131347041544, + "grad_norm": 0.39217695593833923, + "learning_rate": 0.0002, + "loss": 1.7868, + "step": 2060 + }, + { + "epoch": 1.7373059169114562, + "grad_norm": 0.42593324184417725, + "learning_rate": 0.0002, + "loss": 1.7608, + "step": 2070 + }, + { + "epoch": 1.745698699118758, + "grad_norm": 0.362215518951416, + "learning_rate": 0.0002, + "loss": 1.722, + "step": 2080 + }, + { + "epoch": 1.7540914813260597, + "grad_norm": 0.4087955057621002, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2090 + }, + { + "epoch": 1.7624842635333613, + "grad_norm": 0.35127750039100647, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 2100 + }, + { + "epoch": 1.770877045740663, + "grad_norm": 0.33677494525909424, + "learning_rate": 0.0002, + "loss": 1.7405, + "step": 2110 + }, + { + "epoch": 1.7792698279479646, + "grad_norm": 0.39616644382476807, + "learning_rate": 0.0002, + "loss": 1.7478, + "step": 2120 + }, + { + "epoch": 1.7876626101552664, + "grad_norm": 0.4705100953578949, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 2130 + }, + { + "epoch": 1.7960553923625682, + "grad_norm": 0.3893914818763733, + "learning_rate": 0.0002, + "loss": 1.75, + "step": 2140 + }, + { + "epoch": 1.80444817456987, + "grad_norm": 0.3344813585281372, + "learning_rate": 0.0002, + "loss": 1.6711, + "step": 2150 + }, + { + "epoch": 1.8128409567771717, + "grad_norm": 0.36502110958099365, + "learning_rate": 0.0002, + "loss": 1.8329, + "step": 2160 + }, + { + "epoch": 1.8212337389844735, + "grad_norm": 0.3422985374927521, + "learning_rate": 0.0002, + "loss": 1.753, + "step": 2170 + }, + { + "epoch": 1.829626521191775, + "grad_norm": 0.44039851427078247, + "learning_rate": 0.0002, + "loss": 1.6874, + "step": 2180 + }, + { + "epoch": 1.8380193033990768, + "grad_norm": 0.40052926540374756, + "learning_rate": 0.0002, + "loss": 1.7706, + "step": 2190 + }, + { + "epoch": 1.8464120856063784, + "grad_norm": 0.3614487648010254, + "learning_rate": 0.0002, + "loss": 1.7551, + "step": 2200 + }, + { + "epoch": 1.8548048678136801, + "grad_norm": 0.3800305426120758, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2210 + }, + { + "epoch": 1.863197650020982, + "grad_norm": 0.3942040205001831, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 2220 + }, + { + "epoch": 1.8715904322282837, + "grad_norm": 0.36896875500679016, + "learning_rate": 0.0002, + "loss": 1.7187, + "step": 2230 + }, + { + "epoch": 1.8799832144355855, + "grad_norm": 0.3666089177131653, + "learning_rate": 0.0002, + "loss": 1.7371, + "step": 2240 + }, + { + "epoch": 1.8883759966428872, + "grad_norm": 0.3759142756462097, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 2250 + }, + { + "epoch": 1.8967687788501888, + "grad_norm": 0.3711695671081543, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 2260 + }, + { + "epoch": 1.9051615610574906, + "grad_norm": 0.37000006437301636, + "learning_rate": 0.0002, + "loss": 1.7052, + "step": 2270 + }, + { + "epoch": 1.9135543432647921, + "grad_norm": 0.37376025319099426, + "learning_rate": 0.0002, + "loss": 1.7104, + "step": 2280 + }, + { + "epoch": 1.921947125472094, + "grad_norm": 0.3794068694114685, + "learning_rate": 0.0002, + "loss": 1.6641, + "step": 2290 + }, + { + "epoch": 1.9303399076793957, + "grad_norm": 0.42530709505081177, + "learning_rate": 0.0002, + "loss": 1.7693, + "step": 2300 + }, + { + "epoch": 1.9387326898866974, + "grad_norm": 0.3381672203540802, + "learning_rate": 0.0002, + "loss": 1.7871, + "step": 2310 + }, + { + "epoch": 1.9471254720939992, + "grad_norm": 0.3553236722946167, + "learning_rate": 0.0002, + "loss": 1.7502, + "step": 2320 + }, + { + "epoch": 1.955518254301301, + "grad_norm": 0.38204774260520935, + "learning_rate": 0.0002, + "loss": 1.715, + "step": 2330 + }, + { + "epoch": 1.9639110365086025, + "grad_norm": 0.4318946301937103, + "learning_rate": 0.0002, + "loss": 1.7088, + "step": 2340 + }, + { + "epoch": 1.9723038187159043, + "grad_norm": 0.3563119173049927, + "learning_rate": 0.0002, + "loss": 1.7709, + "step": 2350 + }, + { + "epoch": 1.980696600923206, + "grad_norm": 0.362532377243042, + "learning_rate": 0.0002, + "loss": 1.7083, + "step": 2360 + }, + { + "epoch": 1.9890893831305076, + "grad_norm": 0.40200483798980713, + "learning_rate": 0.0002, + "loss": 1.6992, + "step": 2370 + }, + { + "epoch": 1.9974821653378094, + "grad_norm": 0.37397003173828125, + "learning_rate": 0.0002, + "loss": 1.7622, + "step": 2380 + }, + { + "epoch": 2.0, + "eval_loss": 1.807437539100647, + "eval_runtime": 38.0038, + "eval_samples_per_second": 13.551, + "eval_steps_per_second": 1.71, + "step": 2383 + }, + { + "epoch": 2.005874947545111, + "grad_norm": 0.3563518226146698, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 2390 + }, + { + "epoch": 2.014267729752413, + "grad_norm": 0.3913732171058655, + "learning_rate": 0.0002, + "loss": 1.5467, + "step": 2400 + }, + { + "epoch": 2.0226605119597147, + "grad_norm": 0.3511047661304474, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 2410 + }, + { + "epoch": 2.0310532941670165, + "grad_norm": 0.3917897641658783, + "learning_rate": 0.0002, + "loss": 1.599, + "step": 2420 + }, + { + "epoch": 2.0394460763743183, + "grad_norm": 0.36766913533210754, + "learning_rate": 0.0002, + "loss": 1.663, + "step": 2430 + }, + { + "epoch": 2.0478388585816196, + "grad_norm": 0.434097021818161, + "learning_rate": 0.0002, + "loss": 1.5608, + "step": 2440 + }, + { + "epoch": 2.0562316407889214, + "grad_norm": 0.4986756145954132, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 2450 + }, + { + "epoch": 2.064624422996223, + "grad_norm": 0.4377020001411438, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 2460 + }, + { + "epoch": 2.073017205203525, + "grad_norm": 0.4412095546722412, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 2470 + }, + { + "epoch": 2.0814099874108267, + "grad_norm": 0.4463737905025482, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 2480 + }, + { + "epoch": 2.0898027696181285, + "grad_norm": 0.4118853211402893, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 2490 + }, + { + "epoch": 2.0981955518254303, + "grad_norm": 0.48814308643341064, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 2500 + }, + { + "epoch": 2.106588334032732, + "grad_norm": 0.4263038635253906, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2510 + }, + { + "epoch": 2.1149811162400334, + "grad_norm": 0.41060999035835266, + "learning_rate": 0.0002, + "loss": 1.5907, + "step": 2520 + }, + { + "epoch": 2.123373898447335, + "grad_norm": 0.4699285626411438, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 2530 + }, + { + "epoch": 2.131766680654637, + "grad_norm": 0.4321298897266388, + "learning_rate": 0.0002, + "loss": 1.6076, + "step": 2540 + }, + { + "epoch": 2.1401594628619387, + "grad_norm": 0.41544368863105774, + "learning_rate": 0.0002, + "loss": 1.5715, + "step": 2550 + }, + { + "epoch": 2.1485522450692405, + "grad_norm": 0.4529191851615906, + "learning_rate": 0.0002, + "loss": 1.6717, + "step": 2560 + }, + { + "epoch": 2.1569450272765422, + "grad_norm": 0.4370215833187103, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2570 + }, + { + "epoch": 2.165337809483844, + "grad_norm": 0.3878629207611084, + "learning_rate": 0.0002, + "loss": 1.55, + "step": 2580 + }, + { + "epoch": 2.173730591691146, + "grad_norm": 0.47374191880226135, + "learning_rate": 0.0002, + "loss": 1.6863, + "step": 2590 + }, + { + "epoch": 2.182123373898447, + "grad_norm": 0.4551556706428528, + "learning_rate": 0.0002, + "loss": 1.6462, + "step": 2600 + }, + { + "epoch": 2.190516156105749, + "grad_norm": 0.45371633768081665, + "learning_rate": 0.0002, + "loss": 1.6238, + "step": 2610 + }, + { + "epoch": 2.1989089383130507, + "grad_norm": 0.3831859529018402, + "learning_rate": 0.0002, + "loss": 1.6134, + "step": 2620 + }, + { + "epoch": 2.2073017205203525, + "grad_norm": 0.42436569929122925, + "learning_rate": 0.0002, + "loss": 1.6477, + "step": 2630 + }, + { + "epoch": 2.2156945027276542, + "grad_norm": 0.4363750219345093, + "learning_rate": 0.0002, + "loss": 1.6512, + "step": 2640 + }, + { + "epoch": 2.224087284934956, + "grad_norm": 0.4473390579223633, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 2650 + }, + { + "epoch": 2.2324800671422578, + "grad_norm": 0.4419533908367157, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 2660 + }, + { + "epoch": 2.2408728493495595, + "grad_norm": 0.525901198387146, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 2670 + }, + { + "epoch": 2.2492656315568613, + "grad_norm": 0.4345211684703827, + "learning_rate": 0.0002, + "loss": 1.6891, + "step": 2680 + }, + { + "epoch": 2.2576584137641627, + "grad_norm": 0.5169841051101685, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 2690 + }, + { + "epoch": 2.2660511959714644, + "grad_norm": 0.43511003255844116, + "learning_rate": 0.0002, + "loss": 1.6221, + "step": 2700 + }, + { + "epoch": 2.274443978178766, + "grad_norm": 0.4781411588191986, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 2710 + }, + { + "epoch": 2.282836760386068, + "grad_norm": 0.4282242953777313, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 2720 + }, + { + "epoch": 2.2912295425933698, + "grad_norm": 0.4499875605106354, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 2730 + }, + { + "epoch": 2.2996223248006715, + "grad_norm": 0.4133218824863434, + "learning_rate": 0.0002, + "loss": 1.5844, + "step": 2740 + }, + { + "epoch": 2.3080151070079733, + "grad_norm": 0.4706156849861145, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2750 + }, + { + "epoch": 2.3164078892152746, + "grad_norm": 0.4537484347820282, + "learning_rate": 0.0002, + "loss": 1.573, + "step": 2760 + }, + { + "epoch": 2.3248006714225764, + "grad_norm": 0.39736735820770264, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2770 + }, + { + "epoch": 2.333193453629878, + "grad_norm": 0.4488453269004822, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2780 + }, + { + "epoch": 2.34158623583718, + "grad_norm": 0.44405487179756165, + "learning_rate": 0.0002, + "loss": 1.6169, + "step": 2790 + }, + { + "epoch": 2.3499790180444817, + "grad_norm": 0.4726555049419403, + "learning_rate": 0.0002, + "loss": 1.5207, + "step": 2800 + }, + { + "epoch": 2.3583718002517835, + "grad_norm": 0.4820375442504883, + "learning_rate": 0.0002, + "loss": 1.5792, + "step": 2810 + }, + { + "epoch": 2.3667645824590853, + "grad_norm": 0.46176597476005554, + "learning_rate": 0.0002, + "loss": 1.5774, + "step": 2820 + }, + { + "epoch": 2.375157364666387, + "grad_norm": 0.4603394567966461, + "learning_rate": 0.0002, + "loss": 1.6256, + "step": 2830 + }, + { + "epoch": 2.383550146873689, + "grad_norm": 0.4462946355342865, + "learning_rate": 0.0002, + "loss": 1.6598, + "step": 2840 + }, + { + "epoch": 2.39194292908099, + "grad_norm": 0.5216080546379089, + "learning_rate": 0.0002, + "loss": 1.5939, + "step": 2850 + }, + { + "epoch": 2.400335711288292, + "grad_norm": 0.44553086161613464, + "learning_rate": 0.0002, + "loss": 1.5981, + "step": 2860 + }, + { + "epoch": 2.4087284934955937, + "grad_norm": 0.4215725362300873, + "learning_rate": 0.0002, + "loss": 1.6556, + "step": 2870 + }, + { + "epoch": 2.4171212757028955, + "grad_norm": 0.4646450877189636, + "learning_rate": 0.0002, + "loss": 1.6228, + "step": 2880 + }, + { + "epoch": 2.4255140579101973, + "grad_norm": 0.44749370217323303, + "learning_rate": 0.0002, + "loss": 1.6547, + "step": 2890 + }, + { + "epoch": 2.433906840117499, + "grad_norm": 0.4986693859100342, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2900 + }, + { + "epoch": 2.442299622324801, + "grad_norm": 0.4607609808444977, + "learning_rate": 0.0002, + "loss": 1.6294, + "step": 2910 + }, + { + "epoch": 2.4506924045321026, + "grad_norm": 0.4597654938697815, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 2920 + }, + { + "epoch": 2.4590851867394043, + "grad_norm": 0.4106820821762085, + "learning_rate": 0.0002, + "loss": 1.7428, + "step": 2930 + }, + { + "epoch": 2.4674779689467057, + "grad_norm": 0.4531514048576355, + "learning_rate": 0.0002, + "loss": 1.622, + "step": 2940 + }, + { + "epoch": 2.4758707511540075, + "grad_norm": 0.4546769857406616, + "learning_rate": 0.0002, + "loss": 1.6367, + "step": 2950 + }, + { + "epoch": 2.4842635333613092, + "grad_norm": 0.47410622239112854, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 2960 + }, + { + "epoch": 2.492656315568611, + "grad_norm": 0.4498177468776703, + "learning_rate": 0.0002, + "loss": 1.6597, + "step": 2970 + }, + { + "epoch": 2.5010490977759128, + "grad_norm": 0.47267791628837585, + "learning_rate": 0.0002, + "loss": 1.6845, + "step": 2980 + }, + { + "epoch": 2.5094418799832146, + "grad_norm": 0.4340207576751709, + "learning_rate": 0.0002, + "loss": 1.601, + "step": 2990 + }, + { + "epoch": 2.5178346621905163, + "grad_norm": 0.43454936146736145, + "learning_rate": 0.0002, + "loss": 1.5783, + "step": 3000 + }, + { + "epoch": 2.5262274443978177, + "grad_norm": 0.43459394574165344, + "learning_rate": 0.0002, + "loss": 1.5773, + "step": 3010 + }, + { + "epoch": 2.5346202266051194, + "grad_norm": 0.4716770052909851, + "learning_rate": 0.0002, + "loss": 1.6376, + "step": 3020 + }, + { + "epoch": 2.543013008812421, + "grad_norm": 0.4339194595813751, + "learning_rate": 0.0002, + "loss": 1.626, + "step": 3030 + }, + { + "epoch": 2.551405791019723, + "grad_norm": 0.4655593931674957, + "learning_rate": 0.0002, + "loss": 1.6053, + "step": 3040 + }, + { + "epoch": 2.5597985732270248, + "grad_norm": 0.5480475425720215, + "learning_rate": 0.0002, + "loss": 1.5871, + "step": 3050 + }, + { + "epoch": 2.5681913554343265, + "grad_norm": 0.4783174991607666, + "learning_rate": 0.0002, + "loss": 1.7056, + "step": 3060 + }, + { + "epoch": 2.5765841376416283, + "grad_norm": 0.45062026381492615, + "learning_rate": 0.0002, + "loss": 1.5691, + "step": 3070 + }, + { + "epoch": 2.58497691984893, + "grad_norm": 0.4559392035007477, + "learning_rate": 0.0002, + "loss": 1.7005, + "step": 3080 + }, + { + "epoch": 2.593369702056232, + "grad_norm": 0.6581618785858154, + "learning_rate": 0.0002, + "loss": 1.6414, + "step": 3090 + }, + { + "epoch": 2.601762484263533, + "grad_norm": 0.48549333214759827, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 3100 + }, + { + "epoch": 2.610155266470835, + "grad_norm": 0.5358436107635498, + "learning_rate": 0.0002, + "loss": 1.6128, + "step": 3110 + }, + { + "epoch": 2.6185480486781367, + "grad_norm": 0.5380043983459473, + "learning_rate": 0.0002, + "loss": 1.6507, + "step": 3120 + }, + { + "epoch": 2.6269408308854385, + "grad_norm": 0.49887847900390625, + "learning_rate": 0.0002, + "loss": 1.6394, + "step": 3130 + }, + { + "epoch": 2.6353336130927403, + "grad_norm": 0.46039602160453796, + "learning_rate": 0.0002, + "loss": 1.6464, + "step": 3140 + }, + { + "epoch": 2.643726395300042, + "grad_norm": 0.416098952293396, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3150 + }, + { + "epoch": 2.652119177507344, + "grad_norm": 0.465326726436615, + "learning_rate": 0.0002, + "loss": 1.6295, + "step": 3160 + }, + { + "epoch": 2.660511959714645, + "grad_norm": 0.47029924392700195, + "learning_rate": 0.0002, + "loss": 1.5806, + "step": 3170 + }, + { + "epoch": 2.6689047419219474, + "grad_norm": 0.5063307285308838, + "learning_rate": 0.0002, + "loss": 1.6268, + "step": 3180 + }, + { + "epoch": 2.6772975241292487, + "grad_norm": 0.42928868532180786, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3190 + }, + { + "epoch": 2.6856903063365505, + "grad_norm": 0.4170134365558624, + "learning_rate": 0.0002, + "loss": 1.6113, + "step": 3200 + }, + { + "epoch": 2.6940830885438523, + "grad_norm": 0.47810474038124084, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 3210 + }, + { + "epoch": 2.702475870751154, + "grad_norm": 0.44440609216690063, + "learning_rate": 0.0002, + "loss": 1.6808, + "step": 3220 + }, + { + "epoch": 2.710868652958456, + "grad_norm": 0.482759565114975, + "learning_rate": 0.0002, + "loss": 1.5611, + "step": 3230 + }, + { + "epoch": 2.7192614351657576, + "grad_norm": 0.4325942099094391, + "learning_rate": 0.0002, + "loss": 1.6265, + "step": 3240 + }, + { + "epoch": 2.7276542173730594, + "grad_norm": 0.502498984336853, + "learning_rate": 0.0002, + "loss": 1.585, + "step": 3250 + }, + { + "epoch": 2.7360469995803607, + "grad_norm": 0.4725162982940674, + "learning_rate": 0.0002, + "loss": 1.7179, + "step": 3260 + }, + { + "epoch": 2.7444397817876625, + "grad_norm": 0.46781349182128906, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 3270 + }, + { + "epoch": 2.7528325639949642, + "grad_norm": 0.47366851568222046, + "learning_rate": 0.0002, + "loss": 1.6625, + "step": 3280 + }, + { + "epoch": 2.761225346202266, + "grad_norm": 0.5101882815361023, + "learning_rate": 0.0002, + "loss": 1.6437, + "step": 3290 + }, + { + "epoch": 2.769618128409568, + "grad_norm": 0.4874587059020996, + "learning_rate": 0.0002, + "loss": 1.6488, + "step": 3300 + }, + { + "epoch": 2.7780109106168696, + "grad_norm": 0.4989369213581085, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 3310 + }, + { + "epoch": 2.7864036928241713, + "grad_norm": 0.48041442036628723, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 3320 + }, + { + "epoch": 2.7947964750314727, + "grad_norm": 0.4845651090145111, + "learning_rate": 0.0002, + "loss": 1.6137, + "step": 3330 + }, + { + "epoch": 2.803189257238775, + "grad_norm": 0.48575496673583984, + "learning_rate": 0.0002, + "loss": 1.7154, + "step": 3340 + }, + { + "epoch": 2.811582039446076, + "grad_norm": 0.509726881980896, + "learning_rate": 0.0002, + "loss": 1.6771, + "step": 3350 + }, + { + "epoch": 2.819974821653378, + "grad_norm": 0.5026665329933167, + "learning_rate": 0.0002, + "loss": 1.6937, + "step": 3360 + }, + { + "epoch": 2.8283676038606798, + "grad_norm": 0.4727601706981659, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 3370 + }, + { + "epoch": 2.8367603860679815, + "grad_norm": 0.41952234506607056, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 3380 + }, + { + "epoch": 2.8451531682752833, + "grad_norm": 0.49663856625556946, + "learning_rate": 0.0002, + "loss": 1.6639, + "step": 3390 + }, + { + "epoch": 2.853545950482585, + "grad_norm": 0.4934511184692383, + "learning_rate": 0.0002, + "loss": 1.6389, + "step": 3400 + }, + { + "epoch": 2.861938732689887, + "grad_norm": 0.4673226773738861, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3410 + }, + { + "epoch": 2.870331514897188, + "grad_norm": 0.48972779512405396, + "learning_rate": 0.0002, + "loss": 1.641, + "step": 3420 + }, + { + "epoch": 2.8787242971044904, + "grad_norm": 0.5008330345153809, + "learning_rate": 0.0002, + "loss": 1.6047, + "step": 3430 + }, + { + "epoch": 2.8871170793117917, + "grad_norm": 0.43337664008140564, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 3440 + }, + { + "epoch": 2.8955098615190935, + "grad_norm": 0.4430622458457947, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 3450 + }, + { + "epoch": 2.9039026437263953, + "grad_norm": 0.45123326778411865, + "learning_rate": 0.0002, + "loss": 1.6415, + "step": 3460 + }, + { + "epoch": 2.912295425933697, + "grad_norm": 0.47367340326309204, + "learning_rate": 0.0002, + "loss": 1.5913, + "step": 3470 + }, + { + "epoch": 2.920688208140999, + "grad_norm": 0.44940701127052307, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 3480 + }, + { + "epoch": 2.9290809903483006, + "grad_norm": 0.44216281175613403, + "learning_rate": 0.0002, + "loss": 1.6343, + "step": 3490 + }, + { + "epoch": 2.9374737725556024, + "grad_norm": 0.4824782609939575, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 3500 + }, + { + "epoch": 2.9458665547629037, + "grad_norm": 0.43067067861557007, + "learning_rate": 0.0002, + "loss": 1.5949, + "step": 3510 + }, + { + "epoch": 2.9542593369702055, + "grad_norm": 0.46483176946640015, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 3520 + }, + { + "epoch": 2.9626521191775073, + "grad_norm": 0.49230799078941345, + "learning_rate": 0.0002, + "loss": 1.5878, + "step": 3530 + }, + { + "epoch": 2.971044901384809, + "grad_norm": 0.5081011652946472, + "learning_rate": 0.0002, + "loss": 1.5925, + "step": 3540 + }, + { + "epoch": 2.979437683592111, + "grad_norm": 0.5326072573661804, + "learning_rate": 0.0002, + "loss": 1.7402, + "step": 3550 + }, + { + "epoch": 2.9878304657994126, + "grad_norm": 0.4981454014778137, + "learning_rate": 0.0002, + "loss": 1.5769, + "step": 3560 + }, + { + "epoch": 2.9962232480067144, + "grad_norm": 0.4330528676509857, + "learning_rate": 0.0002, + "loss": 1.6073, + "step": 3570 + }, + { + "epoch": 2.999580360889635, + "eval_loss": 1.824695348739624, + "eval_runtime": 37.947, + "eval_samples_per_second": 13.572, + "eval_steps_per_second": 1.713, + "step": 3574 + }, + { + "epoch": 3.004616030214016, + "grad_norm": 0.4380604326725006, + "learning_rate": 0.0002, + "loss": 1.5633, + "step": 3580 + }, + { + "epoch": 3.0130088124213175, + "grad_norm": 0.5375564098358154, + "learning_rate": 0.0002, + "loss": 1.4474, + "step": 3590 + }, + { + "epoch": 3.0214015946286192, + "grad_norm": 0.50722736120224, + "learning_rate": 0.0002, + "loss": 1.5738, + "step": 3600 + }, + { + "epoch": 3.029794376835921, + "grad_norm": 0.5398766994476318, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 3610 + }, + { + "epoch": 3.038187159043223, + "grad_norm": 0.520709753036499, + "learning_rate": 0.0002, + "loss": 1.4401, + "step": 3620 + }, + { + "epoch": 3.0465799412505246, + "grad_norm": 0.5429664850234985, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 3630 + }, + { + "epoch": 3.0549727234578263, + "grad_norm": 0.5634943842887878, + "learning_rate": 0.0002, + "loss": 1.5516, + "step": 3640 + }, + { + "epoch": 3.063365505665128, + "grad_norm": 0.5042277574539185, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 3650 + }, + { + "epoch": 3.07175828787243, + "grad_norm": 0.5778711438179016, + "learning_rate": 0.0002, + "loss": 1.4708, + "step": 3660 + }, + { + "epoch": 3.080151070079731, + "grad_norm": 0.5504926443099976, + "learning_rate": 0.0002, + "loss": 1.5196, + "step": 3670 + }, + { + "epoch": 3.088543852287033, + "grad_norm": 0.5199463963508606, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3680 + }, + { + "epoch": 3.0969366344943348, + "grad_norm": 0.552334189414978, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 3690 + }, + { + "epoch": 3.1053294167016365, + "grad_norm": 0.5650873780250549, + "learning_rate": 0.0002, + "loss": 1.4638, + "step": 3700 + }, + { + "epoch": 3.1137221989089383, + "grad_norm": 0.6292349696159363, + "learning_rate": 0.0002, + "loss": 1.4945, + "step": 3710 + }, + { + "epoch": 3.12211498111624, + "grad_norm": 0.5523604154586792, + "learning_rate": 0.0002, + "loss": 1.4787, + "step": 3720 + }, + { + "epoch": 3.130507763323542, + "grad_norm": 0.6160100698471069, + "learning_rate": 0.0002, + "loss": 1.4697, + "step": 3730 + }, + { + "epoch": 3.1389005455308436, + "grad_norm": 0.6091629266738892, + "learning_rate": 0.0002, + "loss": 1.5589, + "step": 3740 + }, + { + "epoch": 3.1472933277381454, + "grad_norm": 0.5695531964302063, + "learning_rate": 0.0002, + "loss": 1.4659, + "step": 3750 + }, + { + "epoch": 3.1556861099454467, + "grad_norm": 0.569611132144928, + "learning_rate": 0.0002, + "loss": 1.4605, + "step": 3760 + }, + { + "epoch": 3.1640788921527485, + "grad_norm": 0.5761140584945679, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 3770 + }, + { + "epoch": 3.1724716743600503, + "grad_norm": 0.6855548620223999, + "learning_rate": 0.0002, + "loss": 1.4999, + "step": 3780 + }, + { + "epoch": 3.180864456567352, + "grad_norm": 0.5815101265907288, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 3790 + }, + { + "epoch": 3.189257238774654, + "grad_norm": 0.6179960370063782, + "learning_rate": 0.0002, + "loss": 1.5289, + "step": 3800 + }, + { + "epoch": 3.1976500209819556, + "grad_norm": 0.5418674349784851, + "learning_rate": 0.0002, + "loss": 1.4833, + "step": 3810 + }, + { + "epoch": 3.2060428031892574, + "grad_norm": 0.5655816197395325, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 3820 + }, + { + "epoch": 3.214435585396559, + "grad_norm": 0.7279291152954102, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 3830 + }, + { + "epoch": 3.2228283676038605, + "grad_norm": 0.490998238325119, + "learning_rate": 0.0002, + "loss": 1.5672, + "step": 3840 + }, + { + "epoch": 3.2312211498111623, + "grad_norm": 0.6065797209739685, + "learning_rate": 0.0002, + "loss": 1.4683, + "step": 3850 + }, + { + "epoch": 3.239613932018464, + "grad_norm": 0.6024682521820068, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 3860 + }, + { + "epoch": 3.248006714225766, + "grad_norm": 0.5571125745773315, + "learning_rate": 0.0002, + "loss": 1.5123, + "step": 3870 + }, + { + "epoch": 3.2563994964330676, + "grad_norm": 0.5662134289741516, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3880 + }, + { + "epoch": 3.2647922786403694, + "grad_norm": 0.5936661958694458, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 3890 + }, + { + "epoch": 3.273185060847671, + "grad_norm": 0.6739671230316162, + "learning_rate": 0.0002, + "loss": 1.5149, + "step": 3900 + }, + { + "epoch": 3.281577843054973, + "grad_norm": 0.5579532384872437, + "learning_rate": 0.0002, + "loss": 1.5101, + "step": 3910 + }, + { + "epoch": 3.2899706252622742, + "grad_norm": 0.6595954298973083, + "learning_rate": 0.0002, + "loss": 1.4788, + "step": 3920 + }, + { + "epoch": 3.298363407469576, + "grad_norm": 0.5712262988090515, + "learning_rate": 0.0002, + "loss": 1.473, + "step": 3930 + }, + { + "epoch": 3.306756189676878, + "grad_norm": 0.5601761341094971, + "learning_rate": 0.0002, + "loss": 1.5512, + "step": 3940 + }, + { + "epoch": 3.3151489718841796, + "grad_norm": 0.5759967565536499, + "learning_rate": 0.0002, + "loss": 1.4904, + "step": 3950 + }, + { + "epoch": 3.3235417540914813, + "grad_norm": 0.6543047428131104, + "learning_rate": 0.0002, + "loss": 1.4885, + "step": 3960 + }, + { + "epoch": 3.331934536298783, + "grad_norm": 0.6355253458023071, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 3970 + }, + { + "epoch": 3.340327318506085, + "grad_norm": 0.5671007633209229, + "learning_rate": 0.0002, + "loss": 1.5025, + "step": 3980 + }, + { + "epoch": 3.3487201007133867, + "grad_norm": 0.6743636727333069, + "learning_rate": 0.0002, + "loss": 1.5049, + "step": 3990 + }, + { + "epoch": 3.3571128829206884, + "grad_norm": 0.500627338886261, + "learning_rate": 0.0002, + "loss": 1.5527, + "step": 4000 + }, + { + "epoch": 3.3655056651279898, + "grad_norm": 0.5666340589523315, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4010 + }, + { + "epoch": 3.3738984473352915, + "grad_norm": 0.5651408433914185, + "learning_rate": 0.0002, + "loss": 1.5104, + "step": 4020 + }, + { + "epoch": 3.3822912295425933, + "grad_norm": 0.6338897943496704, + "learning_rate": 0.0002, + "loss": 1.4907, + "step": 4030 + }, + { + "epoch": 3.390684011749895, + "grad_norm": 0.5781935453414917, + "learning_rate": 0.0002, + "loss": 1.553, + "step": 4040 + }, + { + "epoch": 3.399076793957197, + "grad_norm": 0.55543053150177, + "learning_rate": 0.0002, + "loss": 1.5535, + "step": 4050 + }, + { + "epoch": 3.4074695761644986, + "grad_norm": 0.6602614521980286, + "learning_rate": 0.0002, + "loss": 1.4884, + "step": 4060 + }, + { + "epoch": 3.4158623583718004, + "grad_norm": 0.5514156222343445, + "learning_rate": 0.0002, + "loss": 1.471, + "step": 4070 + }, + { + "epoch": 3.4242551405791017, + "grad_norm": 0.5760560035705566, + "learning_rate": 0.0002, + "loss": 1.4634, + "step": 4080 + }, + { + "epoch": 3.4326479227864035, + "grad_norm": 0.657503604888916, + "learning_rate": 0.0002, + "loss": 1.4662, + "step": 4090 + }, + { + "epoch": 3.4410407049937053, + "grad_norm": 0.5746736526489258, + "learning_rate": 0.0002, + "loss": 1.5041, + "step": 4100 + }, + { + "epoch": 3.449433487201007, + "grad_norm": 0.5988999009132385, + "learning_rate": 0.0002, + "loss": 1.4387, + "step": 4110 + }, + { + "epoch": 3.457826269408309, + "grad_norm": 0.7294586300849915, + "learning_rate": 0.0002, + "loss": 1.5475, + "step": 4120 + }, + { + "epoch": 3.4662190516156106, + "grad_norm": 0.6391161680221558, + "learning_rate": 0.0002, + "loss": 1.4878, + "step": 4130 + }, + { + "epoch": 3.4746118338229124, + "grad_norm": 0.6416470408439636, + "learning_rate": 0.0002, + "loss": 1.5366, + "step": 4140 + }, + { + "epoch": 3.483004616030214, + "grad_norm": 0.5710626244544983, + "learning_rate": 0.0002, + "loss": 1.5587, + "step": 4150 + }, + { + "epoch": 3.491397398237516, + "grad_norm": 0.5370054841041565, + "learning_rate": 0.0002, + "loss": 1.4661, + "step": 4160 + }, + { + "epoch": 3.4997901804448173, + "grad_norm": 0.5559558272361755, + "learning_rate": 0.0002, + "loss": 1.5167, + "step": 4170 + }, + { + "epoch": 3.508182962652119, + "grad_norm": 0.5426168441772461, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 4180 + }, + { + "epoch": 3.516575744859421, + "grad_norm": 0.5997438430786133, + "learning_rate": 0.0002, + "loss": 1.5241, + "step": 4190 + }, + { + "epoch": 3.5249685270667226, + "grad_norm": 0.5399143099784851, + "learning_rate": 0.0002, + "loss": 1.6091, + "step": 4200 + }, + { + "epoch": 3.5333613092740244, + "grad_norm": 0.6341416239738464, + "learning_rate": 0.0002, + "loss": 1.5066, + "step": 4210 + }, + { + "epoch": 3.541754091481326, + "grad_norm": 0.632238507270813, + "learning_rate": 0.0002, + "loss": 1.5436, + "step": 4220 + }, + { + "epoch": 3.550146873688628, + "grad_norm": 0.6356478333473206, + "learning_rate": 0.0002, + "loss": 1.5423, + "step": 4230 + }, + { + "epoch": 3.5585396558959292, + "grad_norm": 0.6379408240318298, + "learning_rate": 0.0002, + "loss": 1.483, + "step": 4240 + }, + { + "epoch": 3.5669324381032315, + "grad_norm": 0.6265586018562317, + "learning_rate": 0.0002, + "loss": 1.5184, + "step": 4250 + }, + { + "epoch": 3.575325220310533, + "grad_norm": 0.5378820896148682, + "learning_rate": 0.0002, + "loss": 1.5047, + "step": 4260 + }, + { + "epoch": 3.5837180025178346, + "grad_norm": 0.6800801753997803, + "learning_rate": 0.0002, + "loss": 1.5668, + "step": 4270 + }, + { + "epoch": 3.5921107847251363, + "grad_norm": 0.5653113126754761, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4280 + }, + { + "epoch": 3.600503566932438, + "grad_norm": 0.548647940158844, + "learning_rate": 0.0002, + "loss": 1.5007, + "step": 4290 + }, + { + "epoch": 3.60889634913974, + "grad_norm": 0.5729944705963135, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 4300 + }, + { + "epoch": 3.6172891313470417, + "grad_norm": 0.6204999685287476, + "learning_rate": 0.0002, + "loss": 1.575, + "step": 4310 + }, + { + "epoch": 3.6256819135543434, + "grad_norm": 0.6275812983512878, + "learning_rate": 0.0002, + "loss": 1.5107, + "step": 4320 + }, + { + "epoch": 3.6340746957616448, + "grad_norm": 0.7261835336685181, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 4330 + }, + { + "epoch": 3.6424674779689465, + "grad_norm": 0.6048004627227783, + "learning_rate": 0.0002, + "loss": 1.5128, + "step": 4340 + }, + { + "epoch": 3.6508602601762483, + "grad_norm": 0.5879671573638916, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 4350 + }, + { + "epoch": 3.65925304238355, + "grad_norm": 0.6001018285751343, + "learning_rate": 0.0002, + "loss": 1.5477, + "step": 4360 + }, + { + "epoch": 3.667645824590852, + "grad_norm": 0.6468151211738586, + "learning_rate": 0.0002, + "loss": 1.5247, + "step": 4370 + }, + { + "epoch": 3.6760386067981536, + "grad_norm": 0.6342051029205322, + "learning_rate": 0.0002, + "loss": 1.563, + "step": 4380 + }, + { + "epoch": 3.6844313890054554, + "grad_norm": 0.6078384518623352, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 4390 + }, + { + "epoch": 3.692824171212757, + "grad_norm": 0.5555588006973267, + "learning_rate": 0.0002, + "loss": 1.5546, + "step": 4400 + }, + { + "epoch": 3.701216953420059, + "grad_norm": 0.6089665293693542, + "learning_rate": 0.0002, + "loss": 1.5694, + "step": 4410 + }, + { + "epoch": 3.7096097356273603, + "grad_norm": 0.6225191950798035, + "learning_rate": 0.0002, + "loss": 1.5898, + "step": 4420 + }, + { + "epoch": 3.718002517834662, + "grad_norm": 0.5642715692520142, + "learning_rate": 0.0002, + "loss": 1.5153, + "step": 4430 + }, + { + "epoch": 3.726395300041964, + "grad_norm": 0.5703449845314026, + "learning_rate": 0.0002, + "loss": 1.5057, + "step": 4440 + }, + { + "epoch": 3.7347880822492656, + "grad_norm": 0.6029745936393738, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 4450 + }, + { + "epoch": 3.7431808644565674, + "grad_norm": 0.7089189887046814, + "learning_rate": 0.0002, + "loss": 1.5044, + "step": 4460 + }, + { + "epoch": 3.751573646663869, + "grad_norm": 0.6230936050415039, + "learning_rate": 0.0002, + "loss": 1.4804, + "step": 4470 + }, + { + "epoch": 3.759966428871171, + "grad_norm": 0.5718494653701782, + "learning_rate": 0.0002, + "loss": 1.567, + "step": 4480 + }, + { + "epoch": 3.7683592110784723, + "grad_norm": 0.5404117703437805, + "learning_rate": 0.0002, + "loss": 1.5612, + "step": 4490 + }, + { + "epoch": 3.7767519932857745, + "grad_norm": 0.5816529393196106, + "learning_rate": 0.0002, + "loss": 1.4707, + "step": 4500 + }, + { + "epoch": 3.785144775493076, + "grad_norm": 0.6314901113510132, + "learning_rate": 0.0002, + "loss": 1.5802, + "step": 4510 + }, + { + "epoch": 3.7935375577003776, + "grad_norm": 0.7639698386192322, + "learning_rate": 0.0002, + "loss": 1.5445, + "step": 4520 + }, + { + "epoch": 3.8019303399076794, + "grad_norm": 0.5727366209030151, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4530 + }, + { + "epoch": 3.810323122114981, + "grad_norm": 0.6467128396034241, + "learning_rate": 0.0002, + "loss": 1.5409, + "step": 4540 + }, + { + "epoch": 3.818715904322283, + "grad_norm": 0.6572837233543396, + "learning_rate": 0.0002, + "loss": 1.5266, + "step": 4550 + }, + { + "epoch": 3.8271086865295847, + "grad_norm": 0.5847418904304504, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 4560 + }, + { + "epoch": 3.8355014687368865, + "grad_norm": 0.48820871114730835, + "learning_rate": 0.0002, + "loss": 1.5303, + "step": 4570 + }, + { + "epoch": 3.843894250944188, + "grad_norm": 1.2537429332733154, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 4580 + }, + { + "epoch": 3.8522870331514896, + "grad_norm": 0.6026989221572876, + "learning_rate": 0.0002, + "loss": 1.5522, + "step": 4590 + }, + { + "epoch": 3.8606798153587913, + "grad_norm": 0.5541417598724365, + "learning_rate": 0.0002, + "loss": 1.5035, + "step": 4600 + }, + { + "epoch": 3.869072597566093, + "grad_norm": 0.7668771147727966, + "learning_rate": 0.0002, + "loss": 1.5238, + "step": 4610 + }, + { + "epoch": 3.877465379773395, + "grad_norm": 0.6181227564811707, + "learning_rate": 0.0002, + "loss": 1.5428, + "step": 4620 + }, + { + "epoch": 3.8858581619806967, + "grad_norm": 0.5842700004577637, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 4630 + }, + { + "epoch": 3.8942509441879984, + "grad_norm": 0.5824751257896423, + "learning_rate": 0.0002, + "loss": 1.5501, + "step": 4640 + }, + { + "epoch": 3.9026437263952998, + "grad_norm": 0.6212735772132874, + "learning_rate": 0.0002, + "loss": 1.4443, + "step": 4650 + }, + { + "epoch": 3.911036508602602, + "grad_norm": 0.6123346090316772, + "learning_rate": 0.0002, + "loss": 1.4972, + "step": 4660 + }, + { + "epoch": 3.9194292908099033, + "grad_norm": 0.518662691116333, + "learning_rate": 0.0002, + "loss": 1.5531, + "step": 4670 + }, + { + "epoch": 3.927822073017205, + "grad_norm": 0.6963476538658142, + "learning_rate": 0.0002, + "loss": 1.5151, + "step": 4680 + }, + { + "epoch": 3.936214855224507, + "grad_norm": 0.5192152261734009, + "learning_rate": 0.0002, + "loss": 1.5826, + "step": 4690 + }, + { + "epoch": 3.9446076374318086, + "grad_norm": 0.5820888876914978, + "learning_rate": 0.0002, + "loss": 1.5312, + "step": 4700 + }, + { + "epoch": 3.9530004196391104, + "grad_norm": 0.6320387721061707, + "learning_rate": 0.0002, + "loss": 1.527, + "step": 4710 + }, + { + "epoch": 3.961393201846412, + "grad_norm": 0.6174548268318176, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 4720 + }, + { + "epoch": 3.969785984053714, + "grad_norm": 0.6691966652870178, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 4730 + }, + { + "epoch": 3.9781787662610153, + "grad_norm": 0.5972068309783936, + "learning_rate": 0.0002, + "loss": 1.4762, + "step": 4740 + }, + { + "epoch": 3.9865715484683175, + "grad_norm": 0.5759536027908325, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 4750 + }, + { + "epoch": 3.994964330675619, + "grad_norm": 0.5886756777763367, + "learning_rate": 0.0002, + "loss": 1.4836, + "step": 4760 + }, + { + "epoch": 4.0, + "eval_loss": 1.8749940395355225, + "eval_runtime": 38.037, + "eval_samples_per_second": 13.539, + "eval_steps_per_second": 1.709, + "step": 4766 + }, + { + "epoch": 4.003357112882921, + "grad_norm": 0.5915011167526245, + "learning_rate": 0.0002, + "loss": 1.5259, + "step": 4770 + }, + { + "epoch": 4.011749895090222, + "grad_norm": 0.8565000891685486, + "learning_rate": 0.0002, + "loss": 1.4071, + "step": 4780 + }, + { + "epoch": 4.020142677297524, + "grad_norm": 0.7753950953483582, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 4790 + }, + { + "epoch": 4.028535459504826, + "grad_norm": 0.6837254166603088, + "learning_rate": 0.0002, + "loss": 1.3607, + "step": 4800 + }, + { + "epoch": 4.036928241712127, + "grad_norm": 0.8374526500701904, + "learning_rate": 0.0002, + "loss": 1.3275, + "step": 4810 + }, + { + "epoch": 4.0453210239194295, + "grad_norm": 0.8717963099479675, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 4820 + }, + { + "epoch": 4.053713806126731, + "grad_norm": 0.7002043724060059, + "learning_rate": 0.0002, + "loss": 1.3374, + "step": 4830 + }, + { + "epoch": 4.062106588334033, + "grad_norm": 1.0319572687149048, + "learning_rate": 0.0002, + "loss": 1.3882, + "step": 4840 + }, + { + "epoch": 4.070499370541334, + "grad_norm": 0.6746882200241089, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 4850 + }, + { + "epoch": 4.078892152748637, + "grad_norm": 0.8187578320503235, + "learning_rate": 0.0002, + "loss": 1.339, + "step": 4860 + }, + { + "epoch": 4.087284934955938, + "grad_norm": 0.7888399362564087, + "learning_rate": 0.0002, + "loss": 1.368, + "step": 4870 + }, + { + "epoch": 4.095677717163239, + "grad_norm": 0.7149351239204407, + "learning_rate": 0.0002, + "loss": 1.4115, + "step": 4880 + }, + { + "epoch": 4.1040704993705415, + "grad_norm": 0.9067983031272888, + "learning_rate": 0.0002, + "loss": 1.341, + "step": 4890 + }, + { + "epoch": 4.112463281577843, + "grad_norm": 0.771186351776123, + "learning_rate": 0.0002, + "loss": 1.4084, + "step": 4900 + }, + { + "epoch": 4.120856063785145, + "grad_norm": 0.7756485342979431, + "learning_rate": 0.0002, + "loss": 1.2722, + "step": 4910 + }, + { + "epoch": 4.129248845992446, + "grad_norm": 0.7149116396903992, + "learning_rate": 0.0002, + "loss": 1.4138, + "step": 4920 + }, + { + "epoch": 4.137641628199749, + "grad_norm": 0.700442910194397, + "learning_rate": 0.0002, + "loss": 1.3102, + "step": 4930 + }, + { + "epoch": 4.14603441040705, + "grad_norm": 0.8439189195632935, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 4940 + }, + { + "epoch": 4.154427192614351, + "grad_norm": 0.6570779085159302, + "learning_rate": 0.0002, + "loss": 1.3511, + "step": 4950 + }, + { + "epoch": 4.1628199748216534, + "grad_norm": 0.886482298374176, + "learning_rate": 0.0002, + "loss": 1.3955, + "step": 4960 + }, + { + "epoch": 4.171212757028955, + "grad_norm": 0.7220938801765442, + "learning_rate": 0.0002, + "loss": 1.4083, + "step": 4970 + }, + { + "epoch": 4.179605539236257, + "grad_norm": 0.7185905575752258, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 4980 + }, + { + "epoch": 4.187998321443558, + "grad_norm": 0.7566333413124084, + "learning_rate": 0.0002, + "loss": 1.3623, + "step": 4990 + }, + { + "epoch": 4.1963911036508605, + "grad_norm": 0.6960445642471313, + "learning_rate": 0.0002, + "loss": 1.2771, + "step": 5000 + }, + { + "epoch": 4.204783885858162, + "grad_norm": 0.7727336883544922, + "learning_rate": 0.0002, + "loss": 1.3565, + "step": 5010 + }, + { + "epoch": 4.213176668065464, + "grad_norm": 0.8038365244865417, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 5020 + }, + { + "epoch": 4.221569450272765, + "grad_norm": 0.7587628364562988, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5030 + }, + { + "epoch": 4.229962232480067, + "grad_norm": 0.928032398223877, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 5040 + }, + { + "epoch": 4.238355014687369, + "grad_norm": 0.7168642282485962, + "learning_rate": 0.0002, + "loss": 1.3768, + "step": 5050 + }, + { + "epoch": 4.24674779689467, + "grad_norm": 0.7981422543525696, + "learning_rate": 0.0002, + "loss": 1.3767, + "step": 5060 + }, + { + "epoch": 4.2551405791019725, + "grad_norm": 0.6951150894165039, + "learning_rate": 0.0002, + "loss": 1.406, + "step": 5070 + }, + { + "epoch": 4.263533361309274, + "grad_norm": 0.7337371706962585, + "learning_rate": 0.0002, + "loss": 1.3776, + "step": 5080 + }, + { + "epoch": 4.271926143516576, + "grad_norm": 0.8367464542388916, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 5090 + }, + { + "epoch": 4.280318925723877, + "grad_norm": 0.6744083166122437, + "learning_rate": 0.0002, + "loss": 1.3823, + "step": 5100 + }, + { + "epoch": 4.28871170793118, + "grad_norm": 0.9072301387786865, + "learning_rate": 0.0002, + "loss": 1.4183, + "step": 5110 + }, + { + "epoch": 4.297104490138481, + "grad_norm": 0.7703930735588074, + "learning_rate": 0.0002, + "loss": 1.4219, + "step": 5120 + }, + { + "epoch": 4.305497272345782, + "grad_norm": 0.6734083294868469, + "learning_rate": 0.0002, + "loss": 1.3658, + "step": 5130 + }, + { + "epoch": 4.3138900545530845, + "grad_norm": 0.7835540175437927, + "learning_rate": 0.0002, + "loss": 1.441, + "step": 5140 + }, + { + "epoch": 4.322282836760386, + "grad_norm": 1.0822200775146484, + "learning_rate": 0.0002, + "loss": 1.384, + "step": 5150 + }, + { + "epoch": 4.330675618967688, + "grad_norm": 0.8432536721229553, + "learning_rate": 0.0002, + "loss": 1.4167, + "step": 5160 + }, + { + "epoch": 4.339068401174989, + "grad_norm": 0.6739283800125122, + "learning_rate": 0.0002, + "loss": 1.3796, + "step": 5170 + }, + { + "epoch": 4.347461183382292, + "grad_norm": 0.7395278811454773, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 5180 + }, + { + "epoch": 4.355853965589593, + "grad_norm": 0.7638891339302063, + "learning_rate": 0.0002, + "loss": 1.3258, + "step": 5190 + }, + { + "epoch": 4.364246747796894, + "grad_norm": 1.1222662925720215, + "learning_rate": 0.0002, + "loss": 1.34, + "step": 5200 + }, + { + "epoch": 4.3726395300041965, + "grad_norm": 0.9102525115013123, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 5210 + }, + { + "epoch": 4.381032312211498, + "grad_norm": 0.7181593775749207, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 5220 + }, + { + "epoch": 4.3894250944188, + "grad_norm": 0.7813979387283325, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 5230 + }, + { + "epoch": 4.397817876626101, + "grad_norm": 0.8906185626983643, + "learning_rate": 0.0002, + "loss": 1.423, + "step": 5240 + }, + { + "epoch": 4.406210658833404, + "grad_norm": 0.7456443309783936, + "learning_rate": 0.0002, + "loss": 1.3901, + "step": 5250 + }, + { + "epoch": 4.414603441040705, + "grad_norm": 0.8752070069313049, + "learning_rate": 0.0002, + "loss": 1.3292, + "step": 5260 + }, + { + "epoch": 4.422996223248007, + "grad_norm": 0.9560954570770264, + "learning_rate": 0.0002, + "loss": 1.3351, + "step": 5270 + }, + { + "epoch": 4.4313890054553084, + "grad_norm": 0.7227762341499329, + "learning_rate": 0.0002, + "loss": 1.3708, + "step": 5280 + }, + { + "epoch": 4.43978178766261, + "grad_norm": 0.8141599893569946, + "learning_rate": 0.0002, + "loss": 1.4281, + "step": 5290 + }, + { + "epoch": 4.448174569869912, + "grad_norm": 0.928382158279419, + "learning_rate": 0.0002, + "loss": 1.381, + "step": 5300 + }, + { + "epoch": 4.456567352077213, + "grad_norm": 0.7719997763633728, + "learning_rate": 0.0002, + "loss": 1.3586, + "step": 5310 + }, + { + "epoch": 4.4649601342845155, + "grad_norm": 0.8081879615783691, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 5320 + }, + { + "epoch": 4.473352916491817, + "grad_norm": 0.7903412580490112, + "learning_rate": 0.0002, + "loss": 1.4121, + "step": 5330 + }, + { + "epoch": 4.481745698699119, + "grad_norm": 0.7751287221908569, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5340 + }, + { + "epoch": 4.49013848090642, + "grad_norm": 0.8287544250488281, + "learning_rate": 0.0002, + "loss": 1.392, + "step": 5350 + }, + { + "epoch": 4.498531263113723, + "grad_norm": 0.7431012392044067, + "learning_rate": 0.0002, + "loss": 1.3841, + "step": 5360 + }, + { + "epoch": 4.506924045321024, + "grad_norm": 0.8648661971092224, + "learning_rate": 0.0002, + "loss": 1.3843, + "step": 5370 + }, + { + "epoch": 4.515316827528325, + "grad_norm": 0.9314997792243958, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5380 + }, + { + "epoch": 4.5237096097356275, + "grad_norm": 0.7530864477157593, + "learning_rate": 0.0002, + "loss": 1.354, + "step": 5390 + }, + { + "epoch": 4.532102391942929, + "grad_norm": 0.8739821910858154, + "learning_rate": 0.0002, + "loss": 1.4159, + "step": 5400 + }, + { + "epoch": 4.540495174150231, + "grad_norm": 0.8090344667434692, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 4.548887956357532, + "grad_norm": 0.7530879974365234, + "learning_rate": 0.0002, + "loss": 1.4187, + "step": 5420 + }, + { + "epoch": 4.557280738564835, + "grad_norm": 0.8787251114845276, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 5430 + }, + { + "epoch": 4.565673520772136, + "grad_norm": 0.813961923122406, + "learning_rate": 0.0002, + "loss": 1.375, + "step": 5440 + }, + { + "epoch": 4.574066302979437, + "grad_norm": 0.7778232097625732, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 5450 + }, + { + "epoch": 4.5824590851867395, + "grad_norm": 0.7323020696640015, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5460 + }, + { + "epoch": 4.590851867394041, + "grad_norm": 0.7826765179634094, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5470 + }, + { + "epoch": 4.599244649601343, + "grad_norm": 0.7245969772338867, + "learning_rate": 0.0002, + "loss": 1.4068, + "step": 5480 + }, + { + "epoch": 4.607637431808644, + "grad_norm": 0.7697308659553528, + "learning_rate": 0.0002, + "loss": 1.4276, + "step": 5490 + }, + { + "epoch": 4.616030214015947, + "grad_norm": 0.8053571581840515, + "learning_rate": 0.0002, + "loss": 1.3849, + "step": 5500 + }, + { + "epoch": 4.624422996223248, + "grad_norm": 0.6728386282920837, + "learning_rate": 0.0002, + "loss": 1.4225, + "step": 5510 + }, + { + "epoch": 4.632815778430549, + "grad_norm": 0.7398585677146912, + "learning_rate": 0.0002, + "loss": 1.3771, + "step": 5520 + }, + { + "epoch": 4.6412085606378515, + "grad_norm": 0.7896319031715393, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 5530 + }, + { + "epoch": 4.649601342845153, + "grad_norm": 0.8290980458259583, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5540 + }, + { + "epoch": 4.657994125052455, + "grad_norm": 0.8232647776603699, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 5550 + }, + { + "epoch": 4.666386907259756, + "grad_norm": 0.9154987335205078, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5560 + }, + { + "epoch": 4.674779689467059, + "grad_norm": 0.8400886654853821, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 5570 + }, + { + "epoch": 4.68317247167436, + "grad_norm": 0.7312718629837036, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 5580 + }, + { + "epoch": 4.691565253881662, + "grad_norm": 0.8043803572654724, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 5590 + }, + { + "epoch": 4.6999580360889635, + "grad_norm": 0.7966225147247314, + "learning_rate": 0.0002, + "loss": 1.3952, + "step": 5600 + }, + { + "epoch": 4.708350818296266, + "grad_norm": 0.881574809551239, + "learning_rate": 0.0002, + "loss": 1.3429, + "step": 5610 + }, + { + "epoch": 4.716743600503567, + "grad_norm": 0.7252084016799927, + "learning_rate": 0.0002, + "loss": 1.4444, + "step": 5620 + }, + { + "epoch": 4.725136382710868, + "grad_norm": 0.7726518511772156, + "learning_rate": 0.0002, + "loss": 1.3566, + "step": 5630 + }, + { + "epoch": 4.7335291649181706, + "grad_norm": 0.7306379079818726, + "learning_rate": 0.0002, + "loss": 1.3954, + "step": 5640 + }, + { + "epoch": 4.741921947125472, + "grad_norm": 0.8029969334602356, + "learning_rate": 0.0002, + "loss": 1.4385, + "step": 5650 + }, + { + "epoch": 4.750314729332774, + "grad_norm": 0.9103893637657166, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 5660 + }, + { + "epoch": 4.758707511540075, + "grad_norm": 0.8783416748046875, + "learning_rate": 0.0002, + "loss": 1.4026, + "step": 5670 + }, + { + "epoch": 4.767100293747378, + "grad_norm": 0.6807119846343994, + "learning_rate": 0.0002, + "loss": 1.3427, + "step": 5680 + }, + { + "epoch": 4.775493075954679, + "grad_norm": 0.7103772759437561, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 5690 + }, + { + "epoch": 4.78388585816198, + "grad_norm": 0.8472093343734741, + "learning_rate": 0.0002, + "loss": 1.4079, + "step": 5700 + }, + { + "epoch": 4.7922786403692825, + "grad_norm": 0.851847231388092, + "learning_rate": 0.0002, + "loss": 1.3937, + "step": 5710 + }, + { + "epoch": 4.800671422576584, + "grad_norm": 0.9084636569023132, + "learning_rate": 0.0002, + "loss": 1.3965, + "step": 5720 + }, + { + "epoch": 4.809064204783886, + "grad_norm": 0.7628585696220398, + "learning_rate": 0.0002, + "loss": 1.4358, + "step": 5730 + }, + { + "epoch": 4.817456986991187, + "grad_norm": 0.775580883026123, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 5740 + }, + { + "epoch": 4.82584976919849, + "grad_norm": 0.7855771780014038, + "learning_rate": 0.0002, + "loss": 1.4573, + "step": 5750 + }, + { + "epoch": 4.834242551405791, + "grad_norm": 0.7021728754043579, + "learning_rate": 0.0002, + "loss": 1.3991, + "step": 5760 + }, + { + "epoch": 4.842635333613092, + "grad_norm": 0.7810541391372681, + "learning_rate": 0.0002, + "loss": 1.4012, + "step": 5770 + }, + { + "epoch": 4.8510281158203945, + "grad_norm": 0.7290041446685791, + "learning_rate": 0.0002, + "loss": 1.396, + "step": 5780 + }, + { + "epoch": 4.859420898027696, + "grad_norm": 0.9059709906578064, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 5790 + }, + { + "epoch": 4.867813680234998, + "grad_norm": 0.8338062167167664, + "learning_rate": 0.0002, + "loss": 1.4091, + "step": 5800 + }, + { + "epoch": 4.876206462442299, + "grad_norm": 0.830926775932312, + "learning_rate": 0.0002, + "loss": 1.395, + "step": 5810 + }, + { + "epoch": 4.884599244649602, + "grad_norm": 0.7818633317947388, + "learning_rate": 0.0002, + "loss": 1.4261, + "step": 5820 + }, + { + "epoch": 4.892992026856903, + "grad_norm": 0.8143376708030701, + "learning_rate": 0.0002, + "loss": 1.4252, + "step": 5830 + }, + { + "epoch": 4.901384809064205, + "grad_norm": 0.7754496335983276, + "learning_rate": 0.0002, + "loss": 1.3583, + "step": 5840 + }, + { + "epoch": 4.9097775912715065, + "grad_norm": 0.7154468297958374, + "learning_rate": 0.0002, + "loss": 1.4036, + "step": 5850 + }, + { + "epoch": 4.918170373478809, + "grad_norm": 0.6829783916473389, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 5860 + }, + { + "epoch": 4.92656315568611, + "grad_norm": 0.784919261932373, + "learning_rate": 0.0002, + "loss": 1.3854, + "step": 5870 + }, + { + "epoch": 4.934955937893411, + "grad_norm": 0.8168354034423828, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5880 + }, + { + "epoch": 4.943348720100714, + "grad_norm": 0.7356618642807007, + "learning_rate": 0.0002, + "loss": 1.3694, + "step": 5890 + }, + { + "epoch": 4.951741502308015, + "grad_norm": 0.7399224042892456, + "learning_rate": 0.0002, + "loss": 1.4827, + "step": 5900 + }, + { + "epoch": 4.960134284515317, + "grad_norm": 0.7430436015129089, + "learning_rate": 0.0002, + "loss": 1.3643, + "step": 5910 + }, + { + "epoch": 4.9685270667226185, + "grad_norm": 0.7587705850601196, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 5920 + }, + { + "epoch": 4.976919848929921, + "grad_norm": 0.9103638529777527, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5930 + }, + { + "epoch": 4.985312631137222, + "grad_norm": 0.7357394695281982, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5940 + }, + { + "epoch": 4.993705413344523, + "grad_norm": 0.7371547222137451, + "learning_rate": 0.0002, + "loss": 1.3988, + "step": 5950 + }, + { + "epoch": 4.9995803608896345, + "eval_loss": 1.9367210865020752, + "eval_runtime": 37.9833, + "eval_samples_per_second": 13.559, + "eval_steps_per_second": 1.711, + "step": 5957 + }, + { + "epoch": 5.0020981955518256, + "grad_norm": 0.7783351540565491, + "learning_rate": 0.0002, + "loss": 1.3876, + "step": 5960 + }, + { + "epoch": 5.010490977759127, + "grad_norm": 0.9268898367881775, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 5970 + }, + { + "epoch": 5.018883759966429, + "grad_norm": 0.9562761783599854, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 5980 + }, + { + "epoch": 5.02727654217373, + "grad_norm": 0.9391738176345825, + "learning_rate": 0.0002, + "loss": 1.205, + "step": 5990 + }, + { + "epoch": 5.035669324381033, + "grad_norm": 0.850326418876648, + "learning_rate": 0.0002, + "loss": 1.2112, + "step": 6000 + }, + { + "epoch": 5.044062106588334, + "grad_norm": 0.8442679643630981, + "learning_rate": 0.0002, + "loss": 1.2285, + "step": 6010 + }, + { + "epoch": 5.052454888795635, + "grad_norm": 1.2147290706634521, + "learning_rate": 0.0002, + "loss": 1.1677, + "step": 6020 + }, + { + "epoch": 5.0608476710029375, + "grad_norm": 0.9732922315597534, + "learning_rate": 0.0002, + "loss": 1.1836, + "step": 6030 + }, + { + "epoch": 5.069240453210239, + "grad_norm": 0.9354516267776489, + "learning_rate": 0.0002, + "loss": 1.215, + "step": 6040 + }, + { + "epoch": 5.077633235417541, + "grad_norm": 0.9681560397148132, + "learning_rate": 0.0002, + "loss": 1.1918, + "step": 6050 + }, + { + "epoch": 5.086026017624842, + "grad_norm": 0.9500439763069153, + "learning_rate": 0.0002, + "loss": 1.2146, + "step": 6060 + }, + { + "epoch": 5.094418799832145, + "grad_norm": 0.8693879246711731, + "learning_rate": 0.0002, + "loss": 1.1475, + "step": 6070 + }, + { + "epoch": 5.102811582039446, + "grad_norm": 1.1066458225250244, + "learning_rate": 0.0002, + "loss": 1.2181, + "step": 6080 + }, + { + "epoch": 5.111204364246748, + "grad_norm": 0.9530285000801086, + "learning_rate": 0.0002, + "loss": 1.2135, + "step": 6090 + }, + { + "epoch": 5.1195971464540495, + "grad_norm": 0.9323630928993225, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 6100 + }, + { + "epoch": 5.127989928661351, + "grad_norm": 0.9040294885635376, + "learning_rate": 0.0002, + "loss": 1.2434, + "step": 6110 + }, + { + "epoch": 5.136382710868653, + "grad_norm": 0.9981122612953186, + "learning_rate": 0.0002, + "loss": 1.2502, + "step": 6120 + }, + { + "epoch": 5.144775493075954, + "grad_norm": 0.9070921540260315, + "learning_rate": 0.0002, + "loss": 1.2648, + "step": 6130 + }, + { + "epoch": 5.153168275283257, + "grad_norm": 1.043802261352539, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 6140 + }, + { + "epoch": 5.161561057490558, + "grad_norm": 1.0889761447906494, + "learning_rate": 0.0002, + "loss": 1.1865, + "step": 6150 + }, + { + "epoch": 5.16995383969786, + "grad_norm": 0.9908999800682068, + "learning_rate": 0.0002, + "loss": 1.2498, + "step": 6160 + }, + { + "epoch": 5.1783466219051615, + "grad_norm": 1.099233865737915, + "learning_rate": 0.0002, + "loss": 1.2981, + "step": 6170 + }, + { + "epoch": 5.186739404112464, + "grad_norm": 0.9536478519439697, + "learning_rate": 0.0002, + "loss": 1.2236, + "step": 6180 + }, + { + "epoch": 5.195132186319765, + "grad_norm": 0.8672952055931091, + "learning_rate": 0.0002, + "loss": 1.1889, + "step": 6190 + }, + { + "epoch": 5.203524968527066, + "grad_norm": 1.0116329193115234, + "learning_rate": 0.0002, + "loss": 1.2142, + "step": 6200 + }, + { + "epoch": 5.211917750734369, + "grad_norm": 0.9327153563499451, + "learning_rate": 0.0002, + "loss": 1.1813, + "step": 6210 + }, + { + "epoch": 5.22031053294167, + "grad_norm": 0.85637366771698, + "learning_rate": 0.0002, + "loss": 1.2372, + "step": 6220 + }, + { + "epoch": 5.228703315148972, + "grad_norm": 1.0490736961364746, + "learning_rate": 0.0002, + "loss": 1.2949, + "step": 6230 + }, + { + "epoch": 5.2370960973562735, + "grad_norm": 0.8849565982818604, + "learning_rate": 0.0002, + "loss": 1.1604, + "step": 6240 + }, + { + "epoch": 5.245488879563576, + "grad_norm": 0.8852671980857849, + "learning_rate": 0.0002, + "loss": 1.2257, + "step": 6250 + }, + { + "epoch": 5.253881661770877, + "grad_norm": 0.9146860241889954, + "learning_rate": 0.0002, + "loss": 1.275, + "step": 6260 + }, + { + "epoch": 5.262274443978178, + "grad_norm": 1.0188325643539429, + "learning_rate": 0.0002, + "loss": 1.2543, + "step": 6270 + }, + { + "epoch": 5.270667226185481, + "grad_norm": 1.0053156614303589, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 6280 + }, + { + "epoch": 5.279060008392782, + "grad_norm": 0.9962273836135864, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 6290 + }, + { + "epoch": 5.287452790600084, + "grad_norm": 1.000300645828247, + "learning_rate": 0.0002, + "loss": 1.2487, + "step": 6300 + }, + { + "epoch": 5.295845572807385, + "grad_norm": 0.9821932911872864, + "learning_rate": 0.0002, + "loss": 1.3214, + "step": 6310 + }, + { + "epoch": 5.304238355014688, + "grad_norm": 1.0103896856307983, + "learning_rate": 0.0002, + "loss": 1.2964, + "step": 6320 + }, + { + "epoch": 5.312631137221989, + "grad_norm": 0.9323601722717285, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 6330 + }, + { + "epoch": 5.321023919429291, + "grad_norm": 1.0668879747390747, + "learning_rate": 0.0002, + "loss": 1.3165, + "step": 6340 + }, + { + "epoch": 5.3294167016365925, + "grad_norm": 0.9666323065757751, + "learning_rate": 0.0002, + "loss": 1.2411, + "step": 6350 + }, + { + "epoch": 5.337809483843894, + "grad_norm": 0.9439574480056763, + "learning_rate": 0.0002, + "loss": 1.2129, + "step": 6360 + }, + { + "epoch": 5.346202266051196, + "grad_norm": 1.0229361057281494, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 6370 + }, + { + "epoch": 5.354595048258497, + "grad_norm": 0.8522404432296753, + "learning_rate": 0.0002, + "loss": 1.2021, + "step": 6380 + }, + { + "epoch": 5.3629878304658, + "grad_norm": 1.3732287883758545, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 6390 + }, + { + "epoch": 5.371380612673101, + "grad_norm": 0.8201091885566711, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 6400 + }, + { + "epoch": 5.379773394880403, + "grad_norm": 0.8874436616897583, + "learning_rate": 0.0002, + "loss": 1.2867, + "step": 6410 + }, + { + "epoch": 5.3881661770877045, + "grad_norm": 1.0118640661239624, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6420 + }, + { + "epoch": 5.396558959295007, + "grad_norm": 1.0468370914459229, + "learning_rate": 0.0002, + "loss": 1.2952, + "step": 6430 + }, + { + "epoch": 5.404951741502308, + "grad_norm": 0.941806972026825, + "learning_rate": 0.0002, + "loss": 1.2057, + "step": 6440 + }, + { + "epoch": 5.413344523709609, + "grad_norm": 0.9860424399375916, + "learning_rate": 0.0002, + "loss": 1.3289, + "step": 6450 + }, + { + "epoch": 5.421737305916912, + "grad_norm": 1.009628176689148, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 6460 + }, + { + "epoch": 5.430130088124213, + "grad_norm": 0.9842159748077393, + "learning_rate": 0.0002, + "loss": 1.2544, + "step": 6470 + }, + { + "epoch": 5.438522870331515, + "grad_norm": 0.9935571551322937, + "learning_rate": 0.0002, + "loss": 1.2277, + "step": 6480 + }, + { + "epoch": 5.4469156525388165, + "grad_norm": 0.8872362971305847, + "learning_rate": 0.0002, + "loss": 1.2392, + "step": 6490 + }, + { + "epoch": 5.455308434746119, + "grad_norm": 0.9530836939811707, + "learning_rate": 0.0002, + "loss": 1.2166, + "step": 6500 + }, + { + "epoch": 5.46370121695342, + "grad_norm": 0.8111279010772705, + "learning_rate": 0.0002, + "loss": 1.2138, + "step": 6510 + }, + { + "epoch": 5.472093999160721, + "grad_norm": 1.0474516153335571, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 6520 + }, + { + "epoch": 5.480486781368024, + "grad_norm": 1.0228482484817505, + "learning_rate": 0.0002, + "loss": 1.2752, + "step": 6530 + }, + { + "epoch": 5.488879563575325, + "grad_norm": 1.0299347639083862, + "learning_rate": 0.0002, + "loss": 1.2739, + "step": 6540 + }, + { + "epoch": 5.497272345782627, + "grad_norm": 0.9105098247528076, + "learning_rate": 0.0002, + "loss": 1.3163, + "step": 6550 + }, + { + "epoch": 5.5056651279899285, + "grad_norm": 1.2459523677825928, + "learning_rate": 0.0002, + "loss": 1.2718, + "step": 6560 + }, + { + "epoch": 5.514057910197231, + "grad_norm": 1.0630481243133545, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 6570 + }, + { + "epoch": 5.522450692404532, + "grad_norm": 0.8310980796813965, + "learning_rate": 0.0002, + "loss": 1.3003, + "step": 6580 + }, + { + "epoch": 5.530843474611833, + "grad_norm": 1.102723479270935, + "learning_rate": 0.0002, + "loss": 1.1855, + "step": 6590 + }, + { + "epoch": 5.539236256819136, + "grad_norm": 0.9586807489395142, + "learning_rate": 0.0002, + "loss": 1.2889, + "step": 6600 + }, + { + "epoch": 5.547629039026437, + "grad_norm": 0.976191520690918, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 6610 + }, + { + "epoch": 5.556021821233739, + "grad_norm": 0.9943762421607971, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 6620 + }, + { + "epoch": 5.56441460344104, + "grad_norm": 0.8788089156150818, + "learning_rate": 0.0002, + "loss": 1.3103, + "step": 6630 + }, + { + "epoch": 5.572807385648343, + "grad_norm": 0.9866173267364502, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6640 + }, + { + "epoch": 5.581200167855644, + "grad_norm": 1.0791642665863037, + "learning_rate": 0.0002, + "loss": 1.2686, + "step": 6650 + }, + { + "epoch": 5.589592950062946, + "grad_norm": 0.836482584476471, + "learning_rate": 0.0002, + "loss": 1.2806, + "step": 6660 + }, + { + "epoch": 5.5979857322702475, + "grad_norm": 0.9841130971908569, + "learning_rate": 0.0002, + "loss": 1.3114, + "step": 6670 + }, + { + "epoch": 5.60637851447755, + "grad_norm": 0.9678813815116882, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 6680 + }, + { + "epoch": 5.614771296684851, + "grad_norm": 0.9033233523368835, + "learning_rate": 0.0002, + "loss": 1.1969, + "step": 6690 + }, + { + "epoch": 5.623164078892152, + "grad_norm": 0.8691515922546387, + "learning_rate": 0.0002, + "loss": 1.2565, + "step": 6700 + }, + { + "epoch": 5.631556861099455, + "grad_norm": 0.8971360921859741, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6710 + }, + { + "epoch": 5.639949643306756, + "grad_norm": 0.9377756118774414, + "learning_rate": 0.0002, + "loss": 1.2266, + "step": 6720 + }, + { + "epoch": 5.648342425514058, + "grad_norm": 0.908762514591217, + "learning_rate": 0.0002, + "loss": 1.28, + "step": 6730 + }, + { + "epoch": 5.6567352077213595, + "grad_norm": 1.0503337383270264, + "learning_rate": 0.0002, + "loss": 1.2499, + "step": 6740 + }, + { + "epoch": 5.665127989928662, + "grad_norm": 1.030267357826233, + "learning_rate": 0.0002, + "loss": 1.3604, + "step": 6750 + }, + { + "epoch": 5.673520772135963, + "grad_norm": 0.9150485992431641, + "learning_rate": 0.0002, + "loss": 1.2223, + "step": 6760 + }, + { + "epoch": 5.681913554343264, + "grad_norm": 1.0300343036651611, + "learning_rate": 0.0002, + "loss": 1.2651, + "step": 6770 + }, + { + "epoch": 5.690306336550567, + "grad_norm": 1.1242924928665161, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 6780 + }, + { + "epoch": 5.698699118757868, + "grad_norm": 0.9489498138427734, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6790 + }, + { + "epoch": 5.70709190096517, + "grad_norm": 0.8829707503318787, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 6800 + }, + { + "epoch": 5.7154846831724715, + "grad_norm": 1.01392662525177, + "learning_rate": 0.0002, + "loss": 1.2765, + "step": 6810 + }, + { + "epoch": 5.723877465379774, + "grad_norm": 0.9234510064125061, + "learning_rate": 0.0002, + "loss": 1.3029, + "step": 6820 + }, + { + "epoch": 5.732270247587075, + "grad_norm": 0.9439187049865723, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 6830 + }, + { + "epoch": 5.740663029794376, + "grad_norm": 0.8833441734313965, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6840 + }, + { + "epoch": 5.749055812001679, + "grad_norm": 0.9394439458847046, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 6850 + }, + { + "epoch": 5.75744859420898, + "grad_norm": 0.9980010390281677, + "learning_rate": 0.0002, + "loss": 1.3108, + "step": 6860 + }, + { + "epoch": 5.765841376416282, + "grad_norm": 0.9612377882003784, + "learning_rate": 0.0002, + "loss": 1.2958, + "step": 6870 + }, + { + "epoch": 5.7742341586235835, + "grad_norm": 1.0817323923110962, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 6880 + }, + { + "epoch": 5.782626940830886, + "grad_norm": 0.8445103168487549, + "learning_rate": 0.0002, + "loss": 1.2485, + "step": 6890 + }, + { + "epoch": 5.791019723038187, + "grad_norm": 0.8535459041595459, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6900 + }, + { + "epoch": 5.799412505245489, + "grad_norm": 0.9131284356117249, + "learning_rate": 0.0002, + "loss": 1.2729, + "step": 6910 + }, + { + "epoch": 5.807805287452791, + "grad_norm": 0.8627726435661316, + "learning_rate": 0.0002, + "loss": 1.1934, + "step": 6920 + }, + { + "epoch": 5.816198069660093, + "grad_norm": 0.8599951863288879, + "learning_rate": 0.0002, + "loss": 1.3226, + "step": 6930 + }, + { + "epoch": 5.824590851867394, + "grad_norm": 1.0746861696243286, + "learning_rate": 0.0002, + "loss": 1.3078, + "step": 6940 + }, + { + "epoch": 5.8329836340746954, + "grad_norm": 1.0220543146133423, + "learning_rate": 0.0002, + "loss": 1.2653, + "step": 6950 + }, + { + "epoch": 5.841376416281998, + "grad_norm": 0.8891388177871704, + "learning_rate": 0.0002, + "loss": 1.3168, + "step": 6960 + }, + { + "epoch": 5.849769198489299, + "grad_norm": 1.1404683589935303, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 6970 + }, + { + "epoch": 5.858161980696601, + "grad_norm": 0.9665380120277405, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 6980 + }, + { + "epoch": 5.8665547629039025, + "grad_norm": 0.9837968945503235, + "learning_rate": 0.0002, + "loss": 1.2622, + "step": 6990 + }, + { + "epoch": 5.874947545111205, + "grad_norm": 1.0278598070144653, + "learning_rate": 0.0002, + "loss": 1.2973, + "step": 7000 + }, + { + "epoch": 5.883340327318506, + "grad_norm": 0.9990253448486328, + "learning_rate": 0.0002, + "loss": 1.2334, + "step": 7010 + }, + { + "epoch": 5.891733109525807, + "grad_norm": 0.9705647230148315, + "learning_rate": 0.0002, + "loss": 1.3508, + "step": 7020 + }, + { + "epoch": 5.90012589173311, + "grad_norm": 0.9672252535820007, + "learning_rate": 0.0002, + "loss": 1.335, + "step": 7030 + }, + { + "epoch": 5.908518673940411, + "grad_norm": 0.9467034339904785, + "learning_rate": 0.0002, + "loss": 1.2944, + "step": 7040 + }, + { + "epoch": 5.916911456147713, + "grad_norm": 0.9506469964981079, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 7050 + }, + { + "epoch": 5.9253042383550145, + "grad_norm": 0.8936163783073425, + "learning_rate": 0.0002, + "loss": 1.2745, + "step": 7060 + }, + { + "epoch": 5.933697020562317, + "grad_norm": 0.956101655960083, + "learning_rate": 0.0002, + "loss": 1.2702, + "step": 7070 + }, + { + "epoch": 5.942089802769618, + "grad_norm": 0.893535852432251, + "learning_rate": 0.0002, + "loss": 1.2532, + "step": 7080 + }, + { + "epoch": 5.950482584976919, + "grad_norm": 1.0313799381256104, + "learning_rate": 0.0002, + "loss": 1.342, + "step": 7090 + }, + { + "epoch": 5.958875367184222, + "grad_norm": 0.8567915558815002, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 7100 + }, + { + "epoch": 5.967268149391523, + "grad_norm": 0.9683501720428467, + "learning_rate": 0.0002, + "loss": 1.3127, + "step": 7110 + }, + { + "epoch": 5.975660931598825, + "grad_norm": 0.9401984214782715, + "learning_rate": 0.0002, + "loss": 1.2522, + "step": 7120 + }, + { + "epoch": 5.9840537138061265, + "grad_norm": 1.0316764116287231, + "learning_rate": 0.0002, + "loss": 1.3211, + "step": 7130 + }, + { + "epoch": 5.992446496013429, + "grad_norm": 0.9335392713546753, + "learning_rate": 0.0002, + "loss": 1.2445, + "step": 7140 + }, + { + "epoch": 6.0, + "eval_loss": 2.041194438934326, + "eval_runtime": 37.9642, + "eval_samples_per_second": 13.565, + "eval_steps_per_second": 1.712, + "step": 7149 + }, + { + "epoch": 6.00083927822073, + "grad_norm": 1.0247591733932495, + "learning_rate": 0.0002, + "loss": 1.2531, + "step": 7150 + }, + { + "epoch": 6.009232060428032, + "grad_norm": 1.4086190462112427, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 7160 + }, + { + "epoch": 6.017624842635334, + "grad_norm": 1.0636897087097168, + "learning_rate": 0.0002, + "loss": 1.0702, + "step": 7170 + }, + { + "epoch": 6.026017624842635, + "grad_norm": 1.1334257125854492, + "learning_rate": 0.0002, + "loss": 1.118, + "step": 7180 + }, + { + "epoch": 6.034410407049937, + "grad_norm": 1.1142425537109375, + "learning_rate": 0.0002, + "loss": 1.0428, + "step": 7190 + }, + { + "epoch": 6.0428031892572385, + "grad_norm": 1.1448479890823364, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 7200 + }, + { + "epoch": 6.051195971464541, + "grad_norm": 1.181567907333374, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 7210 + }, + { + "epoch": 6.059588753671842, + "grad_norm": 1.0471529960632324, + "learning_rate": 0.0002, + "loss": 1.0435, + "step": 7220 + }, + { + "epoch": 6.067981535879144, + "grad_norm": 1.1432698965072632, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 7230 + }, + { + "epoch": 6.076374318086446, + "grad_norm": 1.1316763162612915, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 7240 + }, + { + "epoch": 6.084767100293748, + "grad_norm": 0.9800271391868591, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 7250 + }, + { + "epoch": 6.093159882501049, + "grad_norm": 1.1878576278686523, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 7260 + }, + { + "epoch": 6.1015526647083504, + "grad_norm": 1.0174267292022705, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 7270 + }, + { + "epoch": 6.109945446915653, + "grad_norm": 0.9622059464454651, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 7280 + }, + { + "epoch": 6.118338229122954, + "grad_norm": 1.3247325420379639, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 7290 + }, + { + "epoch": 6.126731011330256, + "grad_norm": 1.2405189275741577, + "learning_rate": 0.0002, + "loss": 1.0533, + "step": 7300 + }, + { + "epoch": 6.1351237935375575, + "grad_norm": 1.025123953819275, + "learning_rate": 0.0002, + "loss": 1.1345, + "step": 7310 + }, + { + "epoch": 6.14351657574486, + "grad_norm": 1.2966125011444092, + "learning_rate": 0.0002, + "loss": 1.0879, + "step": 7320 + }, + { + "epoch": 6.151909357952161, + "grad_norm": 1.0655252933502197, + "learning_rate": 0.0002, + "loss": 1.106, + "step": 7330 + }, + { + "epoch": 6.160302140159462, + "grad_norm": 1.076251745223999, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 7340 + }, + { + "epoch": 6.168694922366765, + "grad_norm": 1.0632140636444092, + "learning_rate": 0.0002, + "loss": 1.1144, + "step": 7350 + }, + { + "epoch": 6.177087704574066, + "grad_norm": 1.392654538154602, + "learning_rate": 0.0002, + "loss": 1.1284, + "step": 7360 + }, + { + "epoch": 6.185480486781368, + "grad_norm": 1.071683645248413, + "learning_rate": 0.0002, + "loss": 1.0909, + "step": 7370 + }, + { + "epoch": 6.1938732689886695, + "grad_norm": 1.0602295398712158, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 7380 + }, + { + "epoch": 6.202266051195972, + "grad_norm": 1.2152365446090698, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 7390 + }, + { + "epoch": 6.210658833403273, + "grad_norm": 1.1637049913406372, + "learning_rate": 0.0002, + "loss": 1.0622, + "step": 7400 + }, + { + "epoch": 6.219051615610575, + "grad_norm": 1.3976062536239624, + "learning_rate": 0.0002, + "loss": 1.1107, + "step": 7410 + }, + { + "epoch": 6.227444397817877, + "grad_norm": 1.1892462968826294, + "learning_rate": 0.0002, + "loss": 1.084, + "step": 7420 + }, + { + "epoch": 6.235837180025178, + "grad_norm": 1.23629629611969, + "learning_rate": 0.0002, + "loss": 1.0517, + "step": 7430 + }, + { + "epoch": 6.24422996223248, + "grad_norm": 1.2072324752807617, + "learning_rate": 0.0002, + "loss": 1.1069, + "step": 7440 + }, + { + "epoch": 6.2526227444397815, + "grad_norm": 1.2027140855789185, + "learning_rate": 0.0002, + "loss": 1.172, + "step": 7450 + }, + { + "epoch": 6.261015526647084, + "grad_norm": 1.2129466533660889, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 7460 + }, + { + "epoch": 6.269408308854385, + "grad_norm": 1.1675773859024048, + "learning_rate": 0.0002, + "loss": 1.1493, + "step": 7470 + }, + { + "epoch": 6.277801091061687, + "grad_norm": 1.189106822013855, + "learning_rate": 0.0002, + "loss": 1.0884, + "step": 7480 + }, + { + "epoch": 6.286193873268989, + "grad_norm": 0.9968156218528748, + "learning_rate": 0.0002, + "loss": 1.1557, + "step": 7490 + }, + { + "epoch": 6.294586655476291, + "grad_norm": 1.2140403985977173, + "learning_rate": 0.0002, + "loss": 1.1816, + "step": 7500 + }, + { + "epoch": 6.302979437683592, + "grad_norm": 1.1790717840194702, + "learning_rate": 0.0002, + "loss": 1.1163, + "step": 7510 + }, + { + "epoch": 6.3113722198908935, + "grad_norm": 1.1867438554763794, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 7520 + }, + { + "epoch": 6.319765002098196, + "grad_norm": 1.2212399244308472, + "learning_rate": 0.0002, + "loss": 1.1697, + "step": 7530 + }, + { + "epoch": 6.328157784305497, + "grad_norm": 1.1840152740478516, + "learning_rate": 0.0002, + "loss": 1.1103, + "step": 7540 + }, + { + "epoch": 6.336550566512799, + "grad_norm": 1.1392520666122437, + "learning_rate": 0.0002, + "loss": 1.015, + "step": 7550 + }, + { + "epoch": 6.344943348720101, + "grad_norm": 1.2683428525924683, + "learning_rate": 0.0002, + "loss": 1.1686, + "step": 7560 + }, + { + "epoch": 6.353336130927403, + "grad_norm": 1.2927075624465942, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 7570 + }, + { + "epoch": 6.361728913134704, + "grad_norm": 1.1633557081222534, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 7580 + }, + { + "epoch": 6.3701216953420055, + "grad_norm": 1.2839789390563965, + "learning_rate": 0.0002, + "loss": 1.0448, + "step": 7590 + }, + { + "epoch": 6.378514477549308, + "grad_norm": 1.1563365459442139, + "learning_rate": 0.0002, + "loss": 1.0679, + "step": 7600 + }, + { + "epoch": 6.386907259756609, + "grad_norm": 1.3075823783874512, + "learning_rate": 0.0002, + "loss": 1.1222, + "step": 7610 + }, + { + "epoch": 6.395300041963911, + "grad_norm": 1.1148593425750732, + "learning_rate": 0.0002, + "loss": 1.1872, + "step": 7620 + }, + { + "epoch": 6.4036928241712125, + "grad_norm": 1.3017758131027222, + "learning_rate": 0.0002, + "loss": 1.1296, + "step": 7630 + }, + { + "epoch": 6.412085606378515, + "grad_norm": 1.3302847146987915, + "learning_rate": 0.0002, + "loss": 1.0982, + "step": 7640 + }, + { + "epoch": 6.420478388585816, + "grad_norm": 1.3263767957687378, + "learning_rate": 0.0002, + "loss": 1.1228, + "step": 7650 + }, + { + "epoch": 6.428871170793118, + "grad_norm": 1.2079416513442993, + "learning_rate": 0.0002, + "loss": 1.1036, + "step": 7660 + }, + { + "epoch": 6.43726395300042, + "grad_norm": 1.1282644271850586, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7670 + }, + { + "epoch": 6.445656735207721, + "grad_norm": 1.1894482374191284, + "learning_rate": 0.0002, + "loss": 1.1437, + "step": 7680 + }, + { + "epoch": 6.454049517415023, + "grad_norm": 1.2007642984390259, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 7690 + }, + { + "epoch": 6.4624422996223245, + "grad_norm": 1.3172780275344849, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 7700 + }, + { + "epoch": 6.470835081829627, + "grad_norm": 1.113945722579956, + "learning_rate": 0.0002, + "loss": 1.1477, + "step": 7710 + }, + { + "epoch": 6.479227864036928, + "grad_norm": 1.1763832569122314, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 7720 + }, + { + "epoch": 6.48762064624423, + "grad_norm": 1.196928858757019, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 7730 + }, + { + "epoch": 6.496013428451532, + "grad_norm": 1.2109456062316895, + "learning_rate": 0.0002, + "loss": 1.1736, + "step": 7740 + }, + { + "epoch": 6.504406210658834, + "grad_norm": 1.3580254316329956, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 7750 + }, + { + "epoch": 6.512798992866135, + "grad_norm": 1.0432099103927612, + "learning_rate": 0.0002, + "loss": 1.0606, + "step": 7760 + }, + { + "epoch": 6.5211917750734365, + "grad_norm": 1.0125840902328491, + "learning_rate": 0.0002, + "loss": 1.1453, + "step": 7770 + }, + { + "epoch": 6.529584557280739, + "grad_norm": 1.5847094058990479, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 7780 + }, + { + "epoch": 6.53797733948804, + "grad_norm": 1.161391258239746, + "learning_rate": 0.0002, + "loss": 1.0885, + "step": 7790 + }, + { + "epoch": 6.546370121695342, + "grad_norm": 1.1106663942337036, + "learning_rate": 0.0002, + "loss": 1.1549, + "step": 7800 + }, + { + "epoch": 6.554762903902644, + "grad_norm": 1.2467689514160156, + "learning_rate": 0.0002, + "loss": 1.0584, + "step": 7810 + }, + { + "epoch": 6.563155686109946, + "grad_norm": 1.1907767057418823, + "learning_rate": 0.0002, + "loss": 1.0923, + "step": 7820 + }, + { + "epoch": 6.571548468317247, + "grad_norm": 1.1521105766296387, + "learning_rate": 0.0002, + "loss": 1.1606, + "step": 7830 + }, + { + "epoch": 6.5799412505245485, + "grad_norm": 1.2498128414154053, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 7840 + }, + { + "epoch": 6.588334032731851, + "grad_norm": 1.1506036520004272, + "learning_rate": 0.0002, + "loss": 1.0948, + "step": 7850 + }, + { + "epoch": 6.596726814939152, + "grad_norm": 1.118890404701233, + "learning_rate": 0.0002, + "loss": 1.1499, + "step": 7860 + }, + { + "epoch": 6.605119597146454, + "grad_norm": 1.1001442670822144, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 7870 + }, + { + "epoch": 6.613512379353756, + "grad_norm": 1.1551518440246582, + "learning_rate": 0.0002, + "loss": 1.1139, + "step": 7880 + }, + { + "epoch": 6.621905161561058, + "grad_norm": 1.1872174739837646, + "learning_rate": 0.0002, + "loss": 1.1255, + "step": 7890 + }, + { + "epoch": 6.630297943768359, + "grad_norm": 1.1665245294570923, + "learning_rate": 0.0002, + "loss": 1.1013, + "step": 7900 + }, + { + "epoch": 6.6386907259756605, + "grad_norm": 1.1592308282852173, + "learning_rate": 0.0002, + "loss": 1.1857, + "step": 7910 + }, + { + "epoch": 6.647083508182963, + "grad_norm": 1.2712409496307373, + "learning_rate": 0.0002, + "loss": 1.1639, + "step": 7920 + }, + { + "epoch": 6.655476290390264, + "grad_norm": 1.0665934085845947, + "learning_rate": 0.0002, + "loss": 1.147, + "step": 7930 + }, + { + "epoch": 6.663869072597566, + "grad_norm": 1.1843419075012207, + "learning_rate": 0.0002, + "loss": 1.1437, + "step": 7940 + }, + { + "epoch": 6.6722618548048676, + "grad_norm": 1.4945712089538574, + "learning_rate": 0.0002, + "loss": 1.1359, + "step": 7950 + }, + { + "epoch": 6.68065463701217, + "grad_norm": 1.3284149169921875, + "learning_rate": 0.0002, + "loss": 1.1772, + "step": 7960 + }, + { + "epoch": 6.689047419219471, + "grad_norm": 1.1670401096343994, + "learning_rate": 0.0002, + "loss": 1.1183, + "step": 7970 + }, + { + "epoch": 6.697440201426773, + "grad_norm": 1.1963475942611694, + "learning_rate": 0.0002, + "loss": 1.1808, + "step": 7980 + }, + { + "epoch": 6.705832983634075, + "grad_norm": 1.077380657196045, + "learning_rate": 0.0002, + "loss": 1.1489, + "step": 7990 + }, + { + "epoch": 6.714225765841377, + "grad_norm": 0.8758405447006226, + "learning_rate": 0.0002, + "loss": 1.1661, + "step": 8000 + }, + { + "epoch": 6.722618548048678, + "grad_norm": 1.2686632871627808, + "learning_rate": 0.0002, + "loss": 1.169, + "step": 8010 + }, + { + "epoch": 6.7310113302559795, + "grad_norm": 1.1136665344238281, + "learning_rate": 0.0002, + "loss": 1.1486, + "step": 8020 + }, + { + "epoch": 6.739404112463282, + "grad_norm": 1.25029456615448, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 8030 + }, + { + "epoch": 6.747796894670583, + "grad_norm": 1.0269629955291748, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 8040 + }, + { + "epoch": 6.756189676877885, + "grad_norm": 1.1515758037567139, + "learning_rate": 0.0002, + "loss": 1.1707, + "step": 8050 + }, + { + "epoch": 6.764582459085187, + "grad_norm": 1.1150308847427368, + "learning_rate": 0.0002, + "loss": 1.1487, + "step": 8060 + }, + { + "epoch": 6.772975241292489, + "grad_norm": 1.025669813156128, + "learning_rate": 0.0002, + "loss": 1.088, + "step": 8070 + }, + { + "epoch": 6.78136802349979, + "grad_norm": 1.0564825534820557, + "learning_rate": 0.0002, + "loss": 1.1002, + "step": 8080 + }, + { + "epoch": 6.7897608057070915, + "grad_norm": 1.1695157289505005, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 8090 + }, + { + "epoch": 6.798153587914394, + "grad_norm": 1.1086713075637817, + "learning_rate": 0.0002, + "loss": 1.1322, + "step": 8100 + }, + { + "epoch": 6.806546370121695, + "grad_norm": 1.0446662902832031, + "learning_rate": 0.0002, + "loss": 1.2036, + "step": 8110 + }, + { + "epoch": 6.814939152328997, + "grad_norm": 1.2017868757247925, + "learning_rate": 0.0002, + "loss": 1.1106, + "step": 8120 + }, + { + "epoch": 6.823331934536299, + "grad_norm": 1.2538378238677979, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 8130 + }, + { + "epoch": 6.831724716743601, + "grad_norm": 1.1552783250808716, + "learning_rate": 0.0002, + "loss": 1.1506, + "step": 8140 + }, + { + "epoch": 6.840117498950902, + "grad_norm": 1.2151418924331665, + "learning_rate": 0.0002, + "loss": 1.1623, + "step": 8150 + }, + { + "epoch": 6.8485102811582035, + "grad_norm": 1.1431301832199097, + "learning_rate": 0.0002, + "loss": 1.121, + "step": 8160 + }, + { + "epoch": 6.856903063365506, + "grad_norm": 1.0864715576171875, + "learning_rate": 0.0002, + "loss": 1.1312, + "step": 8170 + }, + { + "epoch": 6.865295845572807, + "grad_norm": 1.2602605819702148, + "learning_rate": 0.0002, + "loss": 1.1777, + "step": 8180 + }, + { + "epoch": 6.873688627780109, + "grad_norm": 1.1670788526535034, + "learning_rate": 0.0002, + "loss": 1.1237, + "step": 8190 + }, + { + "epoch": 6.882081409987411, + "grad_norm": 1.1444851160049438, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 8200 + }, + { + "epoch": 6.890474192194713, + "grad_norm": 1.1726973056793213, + "learning_rate": 0.0002, + "loss": 1.1208, + "step": 8210 + }, + { + "epoch": 6.898866974402014, + "grad_norm": 1.0436229705810547, + "learning_rate": 0.0002, + "loss": 1.1666, + "step": 8220 + }, + { + "epoch": 6.907259756609316, + "grad_norm": 1.3296568393707275, + "learning_rate": 0.0002, + "loss": 1.097, + "step": 8230 + }, + { + "epoch": 6.915652538816618, + "grad_norm": 1.2561821937561035, + "learning_rate": 0.0002, + "loss": 1.0581, + "step": 8240 + }, + { + "epoch": 6.92404532102392, + "grad_norm": 1.2071776390075684, + "learning_rate": 0.0002, + "loss": 1.2125, + "step": 8250 + }, + { + "epoch": 6.932438103231221, + "grad_norm": 1.115523099899292, + "learning_rate": 0.0002, + "loss": 1.1433, + "step": 8260 + }, + { + "epoch": 6.940830885438523, + "grad_norm": 1.145468831062317, + "learning_rate": 0.0002, + "loss": 1.2104, + "step": 8270 + }, + { + "epoch": 6.949223667645825, + "grad_norm": 1.2517759799957275, + "learning_rate": 0.0002, + "loss": 1.1654, + "step": 8280 + }, + { + "epoch": 6.957616449853126, + "grad_norm": 1.1757365465164185, + "learning_rate": 0.0002, + "loss": 1.0968, + "step": 8290 + }, + { + "epoch": 6.966009232060428, + "grad_norm": 1.0645636320114136, + "learning_rate": 0.0002, + "loss": 1.1899, + "step": 8300 + }, + { + "epoch": 6.97440201426773, + "grad_norm": 1.2390278577804565, + "learning_rate": 0.0002, + "loss": 1.2665, + "step": 8310 + }, + { + "epoch": 6.982794796475032, + "grad_norm": 1.202418327331543, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 8320 + }, + { + "epoch": 6.991187578682333, + "grad_norm": 1.0840344429016113, + "learning_rate": 0.0002, + "loss": 1.1722, + "step": 8330 + }, + { + "epoch": 6.9995803608896345, + "grad_norm": 1.2504760026931763, + "learning_rate": 0.0002, + "loss": 1.1172, + "step": 8340 + }, + { + "epoch": 6.9995803608896345, + "eval_loss": 2.1729838848114014, + "eval_runtime": 37.9703, + "eval_samples_per_second": 13.563, + "eval_steps_per_second": 1.712, + "step": 8340 + }, + { + "epoch": 7.007973143096937, + "grad_norm": 1.3072566986083984, + "learning_rate": 0.0002, + "loss": 0.9518, + "step": 8350 + }, + { + "epoch": 7.016365925304238, + "grad_norm": 1.4257196187973022, + "learning_rate": 0.0002, + "loss": 0.9095, + "step": 8360 + }, + { + "epoch": 7.02475870751154, + "grad_norm": 1.2966243028640747, + "learning_rate": 0.0002, + "loss": 0.96, + "step": 8370 + }, + { + "epoch": 7.033151489718842, + "grad_norm": 1.3083164691925049, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 8380 + }, + { + "epoch": 7.041544271926144, + "grad_norm": 1.2210543155670166, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 8390 + }, + { + "epoch": 7.049937054133445, + "grad_norm": 1.1458159685134888, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 8400 + }, + { + "epoch": 7.0583298363407465, + "grad_norm": 1.4605761766433716, + "learning_rate": 0.0002, + "loss": 0.9451, + "step": 8410 + }, + { + "epoch": 7.066722618548049, + "grad_norm": 1.435689091682434, + "learning_rate": 0.0002, + "loss": 0.929, + "step": 8420 + }, + { + "epoch": 7.07511540075535, + "grad_norm": 1.4071106910705566, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 8430 + }, + { + "epoch": 7.083508182962652, + "grad_norm": 1.2787632942199707, + "learning_rate": 0.0002, + "loss": 1.0118, + "step": 8440 + }, + { + "epoch": 7.091900965169954, + "grad_norm": 1.4746837615966797, + "learning_rate": 0.0002, + "loss": 0.8974, + "step": 8450 + }, + { + "epoch": 7.100293747377256, + "grad_norm": 1.5315444469451904, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 8460 + }, + { + "epoch": 7.108686529584557, + "grad_norm": 1.3477388620376587, + "learning_rate": 0.0002, + "loss": 0.9211, + "step": 8470 + }, + { + "epoch": 7.117079311791859, + "grad_norm": 1.4741411209106445, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 8480 + }, + { + "epoch": 7.125472093999161, + "grad_norm": 1.4285027980804443, + "learning_rate": 0.0002, + "loss": 0.981, + "step": 8490 + }, + { + "epoch": 7.133864876206462, + "grad_norm": 1.4621654748916626, + "learning_rate": 0.0002, + "loss": 0.9618, + "step": 8500 + }, + { + "epoch": 7.142257658413764, + "grad_norm": 1.5798449516296387, + "learning_rate": 0.0002, + "loss": 1.009, + "step": 8510 + }, + { + "epoch": 7.150650440621066, + "grad_norm": 1.5122318267822266, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 8520 + }, + { + "epoch": 7.159043222828368, + "grad_norm": 1.1761255264282227, + "learning_rate": 0.0002, + "loss": 0.8893, + "step": 8530 + }, + { + "epoch": 7.167436005035669, + "grad_norm": 1.225748062133789, + "learning_rate": 0.0002, + "loss": 0.9484, + "step": 8540 + }, + { + "epoch": 7.175828787242971, + "grad_norm": 1.2034697532653809, + "learning_rate": 0.0002, + "loss": 0.9951, + "step": 8550 + }, + { + "epoch": 7.184221569450273, + "grad_norm": 1.3965253829956055, + "learning_rate": 0.0002, + "loss": 1.0185, + "step": 8560 + }, + { + "epoch": 7.192614351657575, + "grad_norm": 1.5653856992721558, + "learning_rate": 0.0002, + "loss": 0.9591, + "step": 8570 + }, + { + "epoch": 7.201007133864876, + "grad_norm": 1.132654070854187, + "learning_rate": 0.0002, + "loss": 0.9621, + "step": 8580 + }, + { + "epoch": 7.209399916072178, + "grad_norm": 1.563130497932434, + "learning_rate": 0.0002, + "loss": 0.9662, + "step": 8590 + }, + { + "epoch": 7.21779269827948, + "grad_norm": 1.4901666641235352, + "learning_rate": 0.0002, + "loss": 0.9575, + "step": 8600 + }, + { + "epoch": 7.226185480486781, + "grad_norm": 1.2369494438171387, + "learning_rate": 0.0002, + "loss": 0.9401, + "step": 8610 + }, + { + "epoch": 7.234578262694083, + "grad_norm": 1.2923214435577393, + "learning_rate": 0.0002, + "loss": 0.9773, + "step": 8620 + }, + { + "epoch": 7.242971044901385, + "grad_norm": 1.3038378953933716, + "learning_rate": 0.0002, + "loss": 0.9497, + "step": 8630 + }, + { + "epoch": 7.251363827108687, + "grad_norm": 1.4016213417053223, + "learning_rate": 0.0002, + "loss": 0.9361, + "step": 8640 + }, + { + "epoch": 7.259756609315988, + "grad_norm": 1.3319065570831299, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 8650 + }, + { + "epoch": 7.2681493915232895, + "grad_norm": 1.5870885848999023, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 8660 + }, + { + "epoch": 7.276542173730592, + "grad_norm": 1.269951581954956, + "learning_rate": 0.0002, + "loss": 0.8986, + "step": 8670 + }, + { + "epoch": 7.284934955937893, + "grad_norm": 1.6408095359802246, + "learning_rate": 0.0002, + "loss": 0.8962, + "step": 8680 + }, + { + "epoch": 7.293327738145195, + "grad_norm": 1.492431402206421, + "learning_rate": 0.0002, + "loss": 1.0012, + "step": 8690 + }, + { + "epoch": 7.301720520352497, + "grad_norm": 1.5359779596328735, + "learning_rate": 0.0002, + "loss": 0.9855, + "step": 8700 + }, + { + "epoch": 7.310113302559799, + "grad_norm": 1.3436894416809082, + "learning_rate": 0.0002, + "loss": 0.9732, + "step": 8710 + }, + { + "epoch": 7.3185060847671, + "grad_norm": 1.272531270980835, + "learning_rate": 0.0002, + "loss": 1.0006, + "step": 8720 + }, + { + "epoch": 7.326898866974402, + "grad_norm": 1.2252386808395386, + "learning_rate": 0.0002, + "loss": 0.9387, + "step": 8730 + }, + { + "epoch": 7.335291649181704, + "grad_norm": 1.7674977779388428, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 8740 + }, + { + "epoch": 7.343684431389005, + "grad_norm": 1.4869602918624878, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 8750 + }, + { + "epoch": 7.352077213596307, + "grad_norm": 1.7059985399246216, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 8760 + }, + { + "epoch": 7.360469995803609, + "grad_norm": 1.4273415803909302, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 8770 + }, + { + "epoch": 7.368862778010911, + "grad_norm": 1.5042296648025513, + "learning_rate": 0.0002, + "loss": 0.9991, + "step": 8780 + }, + { + "epoch": 7.377255560218212, + "grad_norm": 1.3052846193313599, + "learning_rate": 0.0002, + "loss": 0.9949, + "step": 8790 + }, + { + "epoch": 7.385648342425514, + "grad_norm": 1.2968711853027344, + "learning_rate": 0.0002, + "loss": 0.9305, + "step": 8800 + }, + { + "epoch": 7.394041124632816, + "grad_norm": 1.3339134454727173, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 8810 + }, + { + "epoch": 7.402433906840118, + "grad_norm": 1.4598830938339233, + "learning_rate": 0.0002, + "loss": 1.002, + "step": 8820 + }, + { + "epoch": 7.410826689047419, + "grad_norm": 1.408402442932129, + "learning_rate": 0.0002, + "loss": 1.0351, + "step": 8830 + }, + { + "epoch": 7.419219471254721, + "grad_norm": 1.515499472618103, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 8840 + }, + { + "epoch": 7.427612253462023, + "grad_norm": 1.4303524494171143, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 8850 + }, + { + "epoch": 7.436005035669324, + "grad_norm": 1.2982665300369263, + "learning_rate": 0.0002, + "loss": 1.0585, + "step": 8860 + }, + { + "epoch": 7.444397817876626, + "grad_norm": 1.300026774406433, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 8870 + }, + { + "epoch": 7.452790600083928, + "grad_norm": 1.4231666326522827, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 8880 + }, + { + "epoch": 7.46118338229123, + "grad_norm": 1.3485242128372192, + "learning_rate": 0.0002, + "loss": 1.0629, + "step": 8890 + }, + { + "epoch": 7.469576164498531, + "grad_norm": 1.3709967136383057, + "learning_rate": 0.0002, + "loss": 0.9812, + "step": 8900 + }, + { + "epoch": 7.477968946705833, + "grad_norm": 1.440061330795288, + "learning_rate": 0.0002, + "loss": 0.9762, + "step": 8910 + }, + { + "epoch": 7.486361728913135, + "grad_norm": 1.35463547706604, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 8920 + }, + { + "epoch": 7.494754511120436, + "grad_norm": 1.4464876651763916, + "learning_rate": 0.0002, + "loss": 1.0676, + "step": 8930 + }, + { + "epoch": 7.503147293327738, + "grad_norm": 1.3082282543182373, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 8940 + }, + { + "epoch": 7.51154007553504, + "grad_norm": 1.5687413215637207, + "learning_rate": 0.0002, + "loss": 1.0189, + "step": 8950 + }, + { + "epoch": 7.519932857742342, + "grad_norm": 1.3017815351486206, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 8960 + }, + { + "epoch": 7.528325639949643, + "grad_norm": 1.3839282989501953, + "learning_rate": 0.0002, + "loss": 1.1089, + "step": 8970 + }, + { + "epoch": 7.5367184221569445, + "grad_norm": 1.4667741060256958, + "learning_rate": 0.0002, + "loss": 1.0415, + "step": 8980 + }, + { + "epoch": 7.545111204364247, + "grad_norm": 1.3954358100891113, + "learning_rate": 0.0002, + "loss": 0.9877, + "step": 8990 + }, + { + "epoch": 7.553503986571548, + "grad_norm": 1.2745059728622437, + "learning_rate": 0.0002, + "loss": 0.9216, + "step": 9000 + }, + { + "epoch": 7.56189676877885, + "grad_norm": 1.3012958765029907, + "learning_rate": 0.0002, + "loss": 0.9868, + "step": 9010 + }, + { + "epoch": 7.570289550986152, + "grad_norm": 1.4432767629623413, + "learning_rate": 0.0002, + "loss": 0.9691, + "step": 9020 + }, + { + "epoch": 7.578682333193454, + "grad_norm": 1.3510358333587646, + "learning_rate": 0.0002, + "loss": 1.0086, + "step": 9030 + }, + { + "epoch": 7.587075115400755, + "grad_norm": 1.331549048423767, + "learning_rate": 0.0002, + "loss": 1.0167, + "step": 9040 + }, + { + "epoch": 7.595467897608057, + "grad_norm": 1.4031989574432373, + "learning_rate": 0.0002, + "loss": 0.9904, + "step": 9050 + }, + { + "epoch": 7.603860679815359, + "grad_norm": 1.3684027194976807, + "learning_rate": 0.0002, + "loss": 0.9486, + "step": 9060 + }, + { + "epoch": 7.612253462022661, + "grad_norm": 1.5346373319625854, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 9070 + }, + { + "epoch": 7.620646244229962, + "grad_norm": 1.4921435117721558, + "learning_rate": 0.0002, + "loss": 0.9843, + "step": 9080 + }, + { + "epoch": 7.629039026437264, + "grad_norm": 1.3445239067077637, + "learning_rate": 0.0002, + "loss": 0.9853, + "step": 9090 + }, + { + "epoch": 7.637431808644566, + "grad_norm": 1.4929054975509644, + "learning_rate": 0.0002, + "loss": 1.0377, + "step": 9100 + }, + { + "epoch": 7.645824590851867, + "grad_norm": 1.3410874605178833, + "learning_rate": 0.0002, + "loss": 0.9422, + "step": 9110 + }, + { + "epoch": 7.654217373059169, + "grad_norm": 1.343114972114563, + "learning_rate": 0.0002, + "loss": 1.0323, + "step": 9120 + }, + { + "epoch": 7.662610155266471, + "grad_norm": 1.424418568611145, + "learning_rate": 0.0002, + "loss": 0.9945, + "step": 9130 + }, + { + "epoch": 7.671002937473773, + "grad_norm": 1.3746715784072876, + "learning_rate": 0.0002, + "loss": 0.9923, + "step": 9140 + }, + { + "epoch": 7.679395719681074, + "grad_norm": 1.1734800338745117, + "learning_rate": 0.0002, + "loss": 1.053, + "step": 9150 + }, + { + "epoch": 7.687788501888376, + "grad_norm": 1.4013954401016235, + "learning_rate": 0.0002, + "loss": 1.0328, + "step": 9160 + }, + { + "epoch": 7.696181284095678, + "grad_norm": 1.3568707704544067, + "learning_rate": 0.0002, + "loss": 1.0566, + "step": 9170 + }, + { + "epoch": 7.704574066302979, + "grad_norm": 1.3949618339538574, + "learning_rate": 0.0002, + "loss": 1.0157, + "step": 9180 + }, + { + "epoch": 7.712966848510281, + "grad_norm": 1.4103217124938965, + "learning_rate": 0.0002, + "loss": 1.0468, + "step": 9190 + }, + { + "epoch": 7.721359630717583, + "grad_norm": 1.3260635137557983, + "learning_rate": 0.0002, + "loss": 0.9251, + "step": 9200 + }, + { + "epoch": 7.729752412924885, + "grad_norm": 1.316851019859314, + "learning_rate": 0.0002, + "loss": 1.035, + "step": 9210 + }, + { + "epoch": 7.738145195132186, + "grad_norm": 1.2649954557418823, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 9220 + }, + { + "epoch": 7.746537977339488, + "grad_norm": 1.2904008626937866, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 9230 + }, + { + "epoch": 7.75493075954679, + "grad_norm": 1.6231776475906372, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 9240 + }, + { + "epoch": 7.763323541754091, + "grad_norm": 1.4072569608688354, + "learning_rate": 0.0002, + "loss": 1.0586, + "step": 9250 + }, + { + "epoch": 7.771716323961393, + "grad_norm": 1.4019498825073242, + "learning_rate": 0.0002, + "loss": 0.982, + "step": 9260 + }, + { + "epoch": 7.780109106168695, + "grad_norm": 1.354575276374817, + "learning_rate": 0.0002, + "loss": 1.0308, + "step": 9270 + }, + { + "epoch": 7.788501888375997, + "grad_norm": 1.1940326690673828, + "learning_rate": 0.0002, + "loss": 0.9984, + "step": 9280 + }, + { + "epoch": 7.796894670583298, + "grad_norm": 1.5169446468353271, + "learning_rate": 0.0002, + "loss": 1.004, + "step": 9290 + }, + { + "epoch": 7.8052874527906, + "grad_norm": 1.5126844644546509, + "learning_rate": 0.0002, + "loss": 1.0822, + "step": 9300 + }, + { + "epoch": 7.813680234997902, + "grad_norm": 1.3362282514572144, + "learning_rate": 0.0002, + "loss": 1.0647, + "step": 9310 + }, + { + "epoch": 7.822073017205204, + "grad_norm": 1.505102515220642, + "learning_rate": 0.0002, + "loss": 1.0294, + "step": 9320 + }, + { + "epoch": 7.830465799412505, + "grad_norm": 1.3281409740447998, + "learning_rate": 0.0002, + "loss": 1.0402, + "step": 9330 + }, + { + "epoch": 7.838858581619807, + "grad_norm": 1.6044951677322388, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 9340 + }, + { + "epoch": 7.847251363827109, + "grad_norm": 1.4066485166549683, + "learning_rate": 0.0002, + "loss": 1.0579, + "step": 9350 + }, + { + "epoch": 7.85564414603441, + "grad_norm": 1.3862172365188599, + "learning_rate": 0.0002, + "loss": 1.0726, + "step": 9360 + }, + { + "epoch": 7.864036928241712, + "grad_norm": 1.6576231718063354, + "learning_rate": 0.0002, + "loss": 1.0363, + "step": 9370 + }, + { + "epoch": 7.872429710449014, + "grad_norm": 1.6516666412353516, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 9380 + }, + { + "epoch": 7.880822492656316, + "grad_norm": 1.4599813222885132, + "learning_rate": 0.0002, + "loss": 1.0372, + "step": 9390 + }, + { + "epoch": 7.889215274863617, + "grad_norm": 1.3877774477005005, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 9400 + }, + { + "epoch": 7.897608057070919, + "grad_norm": 1.3922977447509766, + "learning_rate": 0.0002, + "loss": 1.0389, + "step": 9410 + }, + { + "epoch": 7.906000839278221, + "grad_norm": 1.368686556816101, + "learning_rate": 0.0002, + "loss": 1.0022, + "step": 9420 + }, + { + "epoch": 7.914393621485522, + "grad_norm": 1.4226235151290894, + "learning_rate": 0.0002, + "loss": 1.0892, + "step": 9430 + }, + { + "epoch": 7.922786403692824, + "grad_norm": 1.629234790802002, + "learning_rate": 0.0002, + "loss": 1.053, + "step": 9440 + }, + { + "epoch": 7.931179185900126, + "grad_norm": 1.5644806623458862, + "learning_rate": 0.0002, + "loss": 1.0277, + "step": 9450 + }, + { + "epoch": 7.939571968107428, + "grad_norm": 1.1915444135665894, + "learning_rate": 0.0002, + "loss": 1.0567, + "step": 9460 + }, + { + "epoch": 7.947964750314729, + "grad_norm": 1.3066319227218628, + "learning_rate": 0.0002, + "loss": 1.073, + "step": 9470 + }, + { + "epoch": 7.956357532522031, + "grad_norm": 1.2318781614303589, + "learning_rate": 0.0002, + "loss": 1.0097, + "step": 9480 + }, + { + "epoch": 7.964750314729333, + "grad_norm": 1.558817982673645, + "learning_rate": 0.0002, + "loss": 1.0836, + "step": 9490 + }, + { + "epoch": 7.973143096936634, + "grad_norm": 1.2839301824569702, + "learning_rate": 0.0002, + "loss": 1.0311, + "step": 9500 + }, + { + "epoch": 7.981535879143936, + "grad_norm": 1.2938915491104126, + "learning_rate": 0.0002, + "loss": 1.0475, + "step": 9510 + }, + { + "epoch": 7.989928661351238, + "grad_norm": 1.4090218544006348, + "learning_rate": 0.0002, + "loss": 1.0254, + "step": 9520 + }, + { + "epoch": 7.996642887117079, + "eval_loss": 2.3210320472717285, + "eval_runtime": 38.2073, + "eval_samples_per_second": 13.479, + "eval_steps_per_second": 1.701, + "step": 9528 + } + ], + "logging_steps": 10, + "max_steps": 9528, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.4093450594392474e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-9528/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eca8ee269bfcdec21ad5bac19e775efc313c37db --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79c1fd4bf53987c6f3124607286bebbc43d4948b42274b3d15181ff573f7d689 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..dd8f5aa5b9f8f7bf05732ae7aea47dd6f17f50cb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9995803608896349, "step": 1191, "epoch_duration": 1320.4731423854828, "total_accumulated_duration": 1320.4731423854828, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}]} +{"epoch": 2.0, "step": 2383, "epoch_duration": 1298.960376739502, "total_accumulated_duration": 2619.4335191249847, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-1191", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}]} +{"epoch": 2.999580360889635, "step": 3574, "epoch_duration": 1282.171140909195, "total_accumulated_duration": 3901.6046600341797, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}]} +{"epoch": 4.0, "step": 4766, "epoch_duration": 1290.4398715496063, "total_accumulated_duration": 5192.044531583786, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}, {"eval_loss": 1.824695348739624, "eval_runtime": 37.947, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.999580360889635, "step": 3574}, {"loss": 1.5633, "grad_norm": 0.4380604326725006, "learning_rate": 0.0002, "epoch": 3.004616030214016, "step": 3580}, {"loss": 1.4474, "grad_norm": 0.5375564098358154, "learning_rate": 0.0002, "epoch": 3.0130088124213175, "step": 3590}, {"loss": 1.5738, "grad_norm": 0.50722736120224, "learning_rate": 0.0002, "epoch": 3.0214015946286192, "step": 3600}, {"loss": 1.5191, "grad_norm": 0.5398766994476318, "learning_rate": 0.0002, "epoch": 3.029794376835921, "step": 3610}, {"loss": 1.4401, "grad_norm": 0.520709753036499, "learning_rate": 0.0002, "epoch": 3.038187159043223, "step": 3620}, {"loss": 1.5704, "grad_norm": 0.5429664850234985, "learning_rate": 0.0002, "epoch": 3.0465799412505246, "step": 3630}, {"loss": 1.5516, "grad_norm": 0.5634943842887878, "learning_rate": 0.0002, "epoch": 3.0549727234578263, "step": 3640}, {"loss": 1.5349, "grad_norm": 0.5042277574539185, "learning_rate": 0.0002, "epoch": 3.063365505665128, "step": 3650}, {"loss": 1.4708, "grad_norm": 0.5778711438179016, "learning_rate": 0.0002, "epoch": 3.07175828787243, "step": 3660}, {"loss": 1.5196, "grad_norm": 0.5504926443099976, "learning_rate": 0.0002, "epoch": 3.080151070079731, "step": 3670}, {"loss": 1.473, "grad_norm": 0.5199463963508606, "learning_rate": 0.0002, "epoch": 3.088543852287033, "step": 3680}, {"loss": 1.5064, "grad_norm": 0.552334189414978, "learning_rate": 0.0002, "epoch": 3.0969366344943348, "step": 3690}, {"loss": 1.4638, "grad_norm": 0.5650873780250549, "learning_rate": 0.0002, "epoch": 3.1053294167016365, "step": 3700}, {"loss": 1.4945, "grad_norm": 0.6292349696159363, "learning_rate": 0.0002, "epoch": 3.1137221989089383, "step": 3710}, {"loss": 1.4787, "grad_norm": 0.5523604154586792, "learning_rate": 0.0002, "epoch": 3.12211498111624, "step": 3720}, {"loss": 1.4697, "grad_norm": 0.6160100698471069, "learning_rate": 0.0002, "epoch": 3.130507763323542, "step": 3730}, {"loss": 1.5589, "grad_norm": 0.6091629266738892, "learning_rate": 0.0002, "epoch": 3.1389005455308436, "step": 3740}, {"loss": 1.4659, "grad_norm": 0.5695531964302063, "learning_rate": 0.0002, "epoch": 3.1472933277381454, "step": 3750}, {"loss": 1.4605, "grad_norm": 0.569611132144928, "learning_rate": 0.0002, "epoch": 3.1556861099454467, "step": 3760}, {"loss": 1.4592, "grad_norm": 0.5761140584945679, "learning_rate": 0.0002, "epoch": 3.1640788921527485, "step": 3770}, {"loss": 1.4999, "grad_norm": 0.6855548620223999, "learning_rate": 0.0002, "epoch": 3.1724716743600503, "step": 3780}, {"loss": 1.5047, "grad_norm": 0.5815101265907288, "learning_rate": 0.0002, "epoch": 3.180864456567352, "step": 3790}, {"loss": 1.5289, "grad_norm": 0.6179960370063782, "learning_rate": 0.0002, "epoch": 3.189257238774654, "step": 3800}, {"loss": 1.4833, "grad_norm": 0.5418674349784851, "learning_rate": 0.0002, "epoch": 3.1976500209819556, "step": 3810}, {"loss": 1.4994, "grad_norm": 0.5655816197395325, "learning_rate": 0.0002, "epoch": 3.2060428031892574, "step": 3820}, {"loss": 1.5007, "grad_norm": 0.7279291152954102, "learning_rate": 0.0002, "epoch": 3.214435585396559, "step": 3830}, {"loss": 1.5672, "grad_norm": 0.490998238325119, "learning_rate": 0.0002, "epoch": 3.2228283676038605, "step": 3840}, {"loss": 1.4683, "grad_norm": 0.6065797209739685, "learning_rate": 0.0002, "epoch": 3.2312211498111623, "step": 3850}, {"loss": 1.5153, "grad_norm": 0.6024682521820068, "learning_rate": 0.0002, "epoch": 3.239613932018464, "step": 3860}, {"loss": 1.5123, "grad_norm": 0.5571125745773315, "learning_rate": 0.0002, "epoch": 3.248006714225766, "step": 3870}, {"loss": 1.4609, "grad_norm": 0.5662134289741516, "learning_rate": 0.0002, "epoch": 3.2563994964330676, "step": 3880}, {"loss": 1.5452, "grad_norm": 0.5936661958694458, "learning_rate": 0.0002, "epoch": 3.2647922786403694, "step": 3890}, {"loss": 1.5149, "grad_norm": 0.6739671230316162, "learning_rate": 0.0002, "epoch": 3.273185060847671, "step": 3900}, {"loss": 1.5101, "grad_norm": 0.5579532384872437, "learning_rate": 0.0002, "epoch": 3.281577843054973, "step": 3910}, {"loss": 1.4788, "grad_norm": 0.6595954298973083, "learning_rate": 0.0002, "epoch": 3.2899706252622742, "step": 3920}, {"loss": 1.473, "grad_norm": 0.5712262988090515, "learning_rate": 0.0002, "epoch": 3.298363407469576, "step": 3930}, {"loss": 1.5512, "grad_norm": 0.5601761341094971, "learning_rate": 0.0002, "epoch": 3.306756189676878, "step": 3940}, {"loss": 1.4904, "grad_norm": 0.5759967565536499, "learning_rate": 0.0002, "epoch": 3.3151489718841796, "step": 3950}, {"loss": 1.4885, "grad_norm": 0.6543047428131104, "learning_rate": 0.0002, "epoch": 3.3235417540914813, "step": 3960}, {"loss": 1.5063, "grad_norm": 0.6355253458023071, "learning_rate": 0.0002, "epoch": 3.331934536298783, "step": 3970}, {"loss": 1.5025, "grad_norm": 0.5671007633209229, "learning_rate": 0.0002, "epoch": 3.340327318506085, "step": 3980}, {"loss": 1.5049, "grad_norm": 0.6743636727333069, "learning_rate": 0.0002, "epoch": 3.3487201007133867, "step": 3990}, {"loss": 1.5527, "grad_norm": 0.500627338886261, "learning_rate": 0.0002, "epoch": 3.3571128829206884, "step": 4000}, {"loss": 1.4884, "grad_norm": 0.5666340589523315, "learning_rate": 0.0002, "epoch": 3.3655056651279898, "step": 4010}, {"loss": 1.5104, "grad_norm": 0.5651408433914185, "learning_rate": 0.0002, "epoch": 3.3738984473352915, "step": 4020}, {"loss": 1.4907, "grad_norm": 0.6338897943496704, "learning_rate": 0.0002, "epoch": 3.3822912295425933, "step": 4030}, {"loss": 1.553, "grad_norm": 0.5781935453414917, "learning_rate": 0.0002, "epoch": 3.390684011749895, "step": 4040}, {"loss": 1.5535, "grad_norm": 0.55543053150177, "learning_rate": 0.0002, "epoch": 3.399076793957197, "step": 4050}, {"loss": 1.4884, "grad_norm": 0.6602614521980286, "learning_rate": 0.0002, "epoch": 3.4074695761644986, "step": 4060}, {"loss": 1.471, "grad_norm": 0.5514156222343445, "learning_rate": 0.0002, "epoch": 3.4158623583718004, "step": 4070}, {"loss": 1.4634, "grad_norm": 0.5760560035705566, "learning_rate": 0.0002, "epoch": 3.4242551405791017, "step": 4080}, {"loss": 1.4662, "grad_norm": 0.657503604888916, "learning_rate": 0.0002, "epoch": 3.4326479227864035, "step": 4090}, {"loss": 1.5041, "grad_norm": 0.5746736526489258, "learning_rate": 0.0002, "epoch": 3.4410407049937053, "step": 4100}, {"loss": 1.4387, "grad_norm": 0.5988999009132385, "learning_rate": 0.0002, "epoch": 3.449433487201007, "step": 4110}, {"loss": 1.5475, "grad_norm": 0.7294586300849915, "learning_rate": 0.0002, "epoch": 3.457826269408309, "step": 4120}, {"loss": 1.4878, "grad_norm": 0.6391161680221558, "learning_rate": 0.0002, "epoch": 3.4662190516156106, "step": 4130}, {"loss": 1.5366, "grad_norm": 0.6416470408439636, "learning_rate": 0.0002, "epoch": 3.4746118338229124, "step": 4140}, {"loss": 1.5587, "grad_norm": 0.5710626244544983, "learning_rate": 0.0002, "epoch": 3.483004616030214, "step": 4150}, {"loss": 1.4661, "grad_norm": 0.5370054841041565, "learning_rate": 0.0002, "epoch": 3.491397398237516, "step": 4160}, {"loss": 1.5167, "grad_norm": 0.5559558272361755, "learning_rate": 0.0002, "epoch": 3.4997901804448173, "step": 4170}, {"loss": 1.4244, "grad_norm": 0.5426168441772461, "learning_rate": 0.0002, "epoch": 3.508182962652119, "step": 4180}, {"loss": 1.5241, "grad_norm": 0.5997438430786133, "learning_rate": 0.0002, "epoch": 3.516575744859421, "step": 4190}, {"loss": 1.6091, "grad_norm": 0.5399143099784851, "learning_rate": 0.0002, "epoch": 3.5249685270667226, "step": 4200}, {"loss": 1.5066, "grad_norm": 0.6341416239738464, "learning_rate": 0.0002, "epoch": 3.5333613092740244, "step": 4210}, {"loss": 1.5436, "grad_norm": 0.632238507270813, "learning_rate": 0.0002, "epoch": 3.541754091481326, "step": 4220}, {"loss": 1.5423, "grad_norm": 0.6356478333473206, "learning_rate": 0.0002, "epoch": 3.550146873688628, "step": 4230}, {"loss": 1.483, "grad_norm": 0.6379408240318298, "learning_rate": 0.0002, "epoch": 3.5585396558959292, "step": 4240}, {"loss": 1.5184, "grad_norm": 0.6265586018562317, "learning_rate": 0.0002, "epoch": 3.5669324381032315, "step": 4250}, {"loss": 1.5047, "grad_norm": 0.5378820896148682, "learning_rate": 0.0002, "epoch": 3.575325220310533, "step": 4260}, {"loss": 1.5668, "grad_norm": 0.6800801753997803, "learning_rate": 0.0002, "epoch": 3.5837180025178346, "step": 4270}, {"loss": 1.5363, "grad_norm": 0.5653113126754761, "learning_rate": 0.0002, "epoch": 3.5921107847251363, "step": 4280}, {"loss": 1.5007, "grad_norm": 0.548647940158844, "learning_rate": 0.0002, "epoch": 3.600503566932438, "step": 4290}, {"loss": 1.5034, "grad_norm": 0.5729944705963135, "learning_rate": 0.0002, "epoch": 3.60889634913974, "step": 4300}, {"loss": 1.575, "grad_norm": 0.6204999685287476, "learning_rate": 0.0002, "epoch": 3.6172891313470417, "step": 4310}, {"loss": 1.5107, "grad_norm": 0.6275812983512878, "learning_rate": 0.0002, "epoch": 3.6256819135543434, "step": 4320}, {"loss": 1.5013, "grad_norm": 0.7261835336685181, "learning_rate": 0.0002, "epoch": 3.6340746957616448, "step": 4330}, {"loss": 1.5128, "grad_norm": 0.6048004627227783, "learning_rate": 0.0002, "epoch": 3.6424674779689465, "step": 4340}, {"loss": 1.5106, "grad_norm": 0.5879671573638916, "learning_rate": 0.0002, "epoch": 3.6508602601762483, "step": 4350}, {"loss": 1.5477, "grad_norm": 0.6001018285751343, "learning_rate": 0.0002, "epoch": 3.65925304238355, "step": 4360}, {"loss": 1.5247, "grad_norm": 0.6468151211738586, "learning_rate": 0.0002, "epoch": 3.667645824590852, "step": 4370}, {"loss": 1.563, "grad_norm": 0.6342051029205322, "learning_rate": 0.0002, "epoch": 3.6760386067981536, "step": 4380}, {"loss": 1.5444, "grad_norm": 0.6078384518623352, "learning_rate": 0.0002, "epoch": 3.6844313890054554, "step": 4390}, {"loss": 1.5546, "grad_norm": 0.5555588006973267, "learning_rate": 0.0002, "epoch": 3.692824171212757, "step": 4400}, {"loss": 1.5694, "grad_norm": 0.6089665293693542, "learning_rate": 0.0002, "epoch": 3.701216953420059, "step": 4410}, {"loss": 1.5898, "grad_norm": 0.6225191950798035, "learning_rate": 0.0002, "epoch": 3.7096097356273603, "step": 4420}, {"loss": 1.5153, "grad_norm": 0.5642715692520142, "learning_rate": 0.0002, "epoch": 3.718002517834662, "step": 4430}, {"loss": 1.5057, "grad_norm": 0.5703449845314026, "learning_rate": 0.0002, "epoch": 3.726395300041964, "step": 4440}, {"loss": 1.5451, "grad_norm": 0.6029745936393738, "learning_rate": 0.0002, "epoch": 3.7347880822492656, "step": 4450}, {"loss": 1.5044, "grad_norm": 0.7089189887046814, "learning_rate": 0.0002, "epoch": 3.7431808644565674, "step": 4460}, {"loss": 1.4804, "grad_norm": 0.6230936050415039, "learning_rate": 0.0002, "epoch": 3.751573646663869, "step": 4470}, {"loss": 1.567, "grad_norm": 0.5718494653701782, "learning_rate": 0.0002, "epoch": 3.759966428871171, "step": 4480}, {"loss": 1.5612, "grad_norm": 0.5404117703437805, "learning_rate": 0.0002, "epoch": 3.7683592110784723, "step": 4490}, {"loss": 1.4707, "grad_norm": 0.5816529393196106, "learning_rate": 0.0002, "epoch": 3.7767519932857745, "step": 4500}, {"loss": 1.5802, "grad_norm": 0.6314901113510132, "learning_rate": 0.0002, "epoch": 3.785144775493076, "step": 4510}, {"loss": 1.5445, "grad_norm": 0.7639698386192322, "learning_rate": 0.0002, "epoch": 3.7935375577003776, "step": 4520}, {"loss": 1.5718, "grad_norm": 0.5727366209030151, "learning_rate": 0.0002, "epoch": 3.8019303399076794, "step": 4530}, {"loss": 1.5409, "grad_norm": 0.6467128396034241, "learning_rate": 0.0002, "epoch": 3.810323122114981, "step": 4540}, {"loss": 1.5266, "grad_norm": 0.6572837233543396, "learning_rate": 0.0002, "epoch": 3.818715904322283, "step": 4550}, {"loss": 1.5718, "grad_norm": 0.5847418904304504, "learning_rate": 0.0002, "epoch": 3.8271086865295847, "step": 4560}, {"loss": 1.5303, "grad_norm": 0.48820871114730835, "learning_rate": 0.0002, "epoch": 3.8355014687368865, "step": 4570}, {"loss": 1.4911, "grad_norm": 1.2537429332733154, "learning_rate": 0.0002, "epoch": 3.843894250944188, "step": 4580}, {"loss": 1.5522, "grad_norm": 0.6026989221572876, "learning_rate": 0.0002, "epoch": 3.8522870331514896, "step": 4590}, {"loss": 1.5035, "grad_norm": 0.5541417598724365, "learning_rate": 0.0002, "epoch": 3.8606798153587913, "step": 4600}, {"loss": 1.5238, "grad_norm": 0.7668771147727966, "learning_rate": 0.0002, "epoch": 3.869072597566093, "step": 4610}, {"loss": 1.5428, "grad_norm": 0.6181227564811707, "learning_rate": 0.0002, "epoch": 3.877465379773395, "step": 4620}, {"loss": 1.5242, "grad_norm": 0.5842700004577637, "learning_rate": 0.0002, "epoch": 3.8858581619806967, "step": 4630}, {"loss": 1.5501, "grad_norm": 0.5824751257896423, "learning_rate": 0.0002, "epoch": 3.8942509441879984, "step": 4640}, {"loss": 1.4443, "grad_norm": 0.6212735772132874, "learning_rate": 0.0002, "epoch": 3.9026437263952998, "step": 4650}, {"loss": 1.4972, "grad_norm": 0.6123346090316772, "learning_rate": 0.0002, "epoch": 3.911036508602602, "step": 4660}, {"loss": 1.5531, "grad_norm": 0.518662691116333, "learning_rate": 0.0002, "epoch": 3.9194292908099033, "step": 4670}, {"loss": 1.5151, "grad_norm": 0.6963476538658142, "learning_rate": 0.0002, "epoch": 3.927822073017205, "step": 4680}, {"loss": 1.5826, "grad_norm": 0.5192152261734009, "learning_rate": 0.0002, "epoch": 3.936214855224507, "step": 4690}, {"loss": 1.5312, "grad_norm": 0.5820888876914978, "learning_rate": 0.0002, "epoch": 3.9446076374318086, "step": 4700}, {"loss": 1.527, "grad_norm": 0.6320387721061707, "learning_rate": 0.0002, "epoch": 3.9530004196391104, "step": 4710}, {"loss": 1.6006, "grad_norm": 0.6174548268318176, "learning_rate": 0.0002, "epoch": 3.961393201846412, "step": 4720}, {"loss": 1.5581, "grad_norm": 0.6691966652870178, "learning_rate": 0.0002, "epoch": 3.969785984053714, "step": 4730}, {"loss": 1.4762, "grad_norm": 0.5972068309783936, "learning_rate": 0.0002, "epoch": 3.9781787662610153, "step": 4740}, {"loss": 1.4947, "grad_norm": 0.5759536027908325, "learning_rate": 0.0002, "epoch": 3.9865715484683175, "step": 4750}, {"loss": 1.4836, "grad_norm": 0.5886756777763367, "learning_rate": 0.0002, "epoch": 3.994964330675619, "step": 4760}]} +{"epoch": 4.9995803608896345, "step": 5957, "epoch_duration": 1275.609186887741, "total_accumulated_duration": 6467.653718471527, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}, {"eval_loss": 1.824695348739624, "eval_runtime": 37.947, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.999580360889635, "step": 3574}, {"loss": 1.5633, "grad_norm": 0.4380604326725006, "learning_rate": 0.0002, "epoch": 3.004616030214016, "step": 3580}, {"loss": 1.4474, "grad_norm": 0.5375564098358154, "learning_rate": 0.0002, "epoch": 3.0130088124213175, "step": 3590}, {"loss": 1.5738, "grad_norm": 0.50722736120224, "learning_rate": 0.0002, "epoch": 3.0214015946286192, "step": 3600}, {"loss": 1.5191, "grad_norm": 0.5398766994476318, "learning_rate": 0.0002, "epoch": 3.029794376835921, "step": 3610}, {"loss": 1.4401, "grad_norm": 0.520709753036499, "learning_rate": 0.0002, "epoch": 3.038187159043223, "step": 3620}, {"loss": 1.5704, "grad_norm": 0.5429664850234985, "learning_rate": 0.0002, "epoch": 3.0465799412505246, "step": 3630}, {"loss": 1.5516, "grad_norm": 0.5634943842887878, "learning_rate": 0.0002, "epoch": 3.0549727234578263, "step": 3640}, {"loss": 1.5349, "grad_norm": 0.5042277574539185, "learning_rate": 0.0002, "epoch": 3.063365505665128, "step": 3650}, {"loss": 1.4708, "grad_norm": 0.5778711438179016, "learning_rate": 0.0002, "epoch": 3.07175828787243, "step": 3660}, {"loss": 1.5196, "grad_norm": 0.5504926443099976, "learning_rate": 0.0002, "epoch": 3.080151070079731, "step": 3670}, {"loss": 1.473, "grad_norm": 0.5199463963508606, "learning_rate": 0.0002, "epoch": 3.088543852287033, "step": 3680}, {"loss": 1.5064, "grad_norm": 0.552334189414978, "learning_rate": 0.0002, "epoch": 3.0969366344943348, "step": 3690}, {"loss": 1.4638, "grad_norm": 0.5650873780250549, "learning_rate": 0.0002, "epoch": 3.1053294167016365, "step": 3700}, {"loss": 1.4945, "grad_norm": 0.6292349696159363, "learning_rate": 0.0002, "epoch": 3.1137221989089383, "step": 3710}, {"loss": 1.4787, "grad_norm": 0.5523604154586792, "learning_rate": 0.0002, "epoch": 3.12211498111624, "step": 3720}, {"loss": 1.4697, "grad_norm": 0.6160100698471069, "learning_rate": 0.0002, "epoch": 3.130507763323542, "step": 3730}, {"loss": 1.5589, "grad_norm": 0.6091629266738892, "learning_rate": 0.0002, "epoch": 3.1389005455308436, "step": 3740}, {"loss": 1.4659, "grad_norm": 0.5695531964302063, "learning_rate": 0.0002, "epoch": 3.1472933277381454, "step": 3750}, {"loss": 1.4605, "grad_norm": 0.569611132144928, "learning_rate": 0.0002, "epoch": 3.1556861099454467, "step": 3760}, {"loss": 1.4592, "grad_norm": 0.5761140584945679, "learning_rate": 0.0002, "epoch": 3.1640788921527485, "step": 3770}, {"loss": 1.4999, "grad_norm": 0.6855548620223999, "learning_rate": 0.0002, "epoch": 3.1724716743600503, "step": 3780}, {"loss": 1.5047, "grad_norm": 0.5815101265907288, "learning_rate": 0.0002, "epoch": 3.180864456567352, "step": 3790}, {"loss": 1.5289, "grad_norm": 0.6179960370063782, "learning_rate": 0.0002, "epoch": 3.189257238774654, "step": 3800}, {"loss": 1.4833, "grad_norm": 0.5418674349784851, "learning_rate": 0.0002, "epoch": 3.1976500209819556, "step": 3810}, {"loss": 1.4994, "grad_norm": 0.5655816197395325, "learning_rate": 0.0002, "epoch": 3.2060428031892574, "step": 3820}, {"loss": 1.5007, "grad_norm": 0.7279291152954102, "learning_rate": 0.0002, "epoch": 3.214435585396559, "step": 3830}, {"loss": 1.5672, "grad_norm": 0.490998238325119, "learning_rate": 0.0002, "epoch": 3.2228283676038605, "step": 3840}, {"loss": 1.4683, "grad_norm": 0.6065797209739685, "learning_rate": 0.0002, "epoch": 3.2312211498111623, "step": 3850}, {"loss": 1.5153, "grad_norm": 0.6024682521820068, "learning_rate": 0.0002, "epoch": 3.239613932018464, "step": 3860}, {"loss": 1.5123, "grad_norm": 0.5571125745773315, "learning_rate": 0.0002, "epoch": 3.248006714225766, "step": 3870}, {"loss": 1.4609, "grad_norm": 0.5662134289741516, "learning_rate": 0.0002, "epoch": 3.2563994964330676, "step": 3880}, {"loss": 1.5452, "grad_norm": 0.5936661958694458, "learning_rate": 0.0002, "epoch": 3.2647922786403694, "step": 3890}, {"loss": 1.5149, "grad_norm": 0.6739671230316162, "learning_rate": 0.0002, "epoch": 3.273185060847671, "step": 3900}, {"loss": 1.5101, "grad_norm": 0.5579532384872437, "learning_rate": 0.0002, "epoch": 3.281577843054973, "step": 3910}, {"loss": 1.4788, "grad_norm": 0.6595954298973083, "learning_rate": 0.0002, "epoch": 3.2899706252622742, "step": 3920}, {"loss": 1.473, "grad_norm": 0.5712262988090515, "learning_rate": 0.0002, "epoch": 3.298363407469576, "step": 3930}, {"loss": 1.5512, "grad_norm": 0.5601761341094971, "learning_rate": 0.0002, "epoch": 3.306756189676878, "step": 3940}, {"loss": 1.4904, "grad_norm": 0.5759967565536499, "learning_rate": 0.0002, "epoch": 3.3151489718841796, "step": 3950}, {"loss": 1.4885, "grad_norm": 0.6543047428131104, "learning_rate": 0.0002, "epoch": 3.3235417540914813, "step": 3960}, {"loss": 1.5063, "grad_norm": 0.6355253458023071, "learning_rate": 0.0002, "epoch": 3.331934536298783, "step": 3970}, {"loss": 1.5025, "grad_norm": 0.5671007633209229, "learning_rate": 0.0002, "epoch": 3.340327318506085, "step": 3980}, {"loss": 1.5049, "grad_norm": 0.6743636727333069, "learning_rate": 0.0002, "epoch": 3.3487201007133867, "step": 3990}, {"loss": 1.5527, "grad_norm": 0.500627338886261, "learning_rate": 0.0002, "epoch": 3.3571128829206884, "step": 4000}, {"loss": 1.4884, "grad_norm": 0.5666340589523315, "learning_rate": 0.0002, "epoch": 3.3655056651279898, "step": 4010}, {"loss": 1.5104, "grad_norm": 0.5651408433914185, "learning_rate": 0.0002, "epoch": 3.3738984473352915, "step": 4020}, {"loss": 1.4907, "grad_norm": 0.6338897943496704, "learning_rate": 0.0002, "epoch": 3.3822912295425933, "step": 4030}, {"loss": 1.553, "grad_norm": 0.5781935453414917, "learning_rate": 0.0002, "epoch": 3.390684011749895, "step": 4040}, {"loss": 1.5535, "grad_norm": 0.55543053150177, "learning_rate": 0.0002, "epoch": 3.399076793957197, "step": 4050}, {"loss": 1.4884, "grad_norm": 0.6602614521980286, "learning_rate": 0.0002, "epoch": 3.4074695761644986, "step": 4060}, {"loss": 1.471, "grad_norm": 0.5514156222343445, "learning_rate": 0.0002, "epoch": 3.4158623583718004, "step": 4070}, {"loss": 1.4634, "grad_norm": 0.5760560035705566, "learning_rate": 0.0002, "epoch": 3.4242551405791017, "step": 4080}, {"loss": 1.4662, "grad_norm": 0.657503604888916, "learning_rate": 0.0002, "epoch": 3.4326479227864035, "step": 4090}, {"loss": 1.5041, "grad_norm": 0.5746736526489258, "learning_rate": 0.0002, "epoch": 3.4410407049937053, "step": 4100}, {"loss": 1.4387, "grad_norm": 0.5988999009132385, "learning_rate": 0.0002, "epoch": 3.449433487201007, "step": 4110}, {"loss": 1.5475, "grad_norm": 0.7294586300849915, "learning_rate": 0.0002, "epoch": 3.457826269408309, "step": 4120}, {"loss": 1.4878, "grad_norm": 0.6391161680221558, "learning_rate": 0.0002, "epoch": 3.4662190516156106, "step": 4130}, {"loss": 1.5366, "grad_norm": 0.6416470408439636, "learning_rate": 0.0002, "epoch": 3.4746118338229124, "step": 4140}, {"loss": 1.5587, "grad_norm": 0.5710626244544983, "learning_rate": 0.0002, "epoch": 3.483004616030214, "step": 4150}, {"loss": 1.4661, "grad_norm": 0.5370054841041565, "learning_rate": 0.0002, "epoch": 3.491397398237516, "step": 4160}, {"loss": 1.5167, "grad_norm": 0.5559558272361755, "learning_rate": 0.0002, "epoch": 3.4997901804448173, "step": 4170}, {"loss": 1.4244, "grad_norm": 0.5426168441772461, "learning_rate": 0.0002, "epoch": 3.508182962652119, "step": 4180}, {"loss": 1.5241, "grad_norm": 0.5997438430786133, "learning_rate": 0.0002, "epoch": 3.516575744859421, "step": 4190}, {"loss": 1.6091, "grad_norm": 0.5399143099784851, "learning_rate": 0.0002, "epoch": 3.5249685270667226, "step": 4200}, {"loss": 1.5066, "grad_norm": 0.6341416239738464, "learning_rate": 0.0002, "epoch": 3.5333613092740244, "step": 4210}, {"loss": 1.5436, "grad_norm": 0.632238507270813, "learning_rate": 0.0002, "epoch": 3.541754091481326, "step": 4220}, {"loss": 1.5423, "grad_norm": 0.6356478333473206, "learning_rate": 0.0002, "epoch": 3.550146873688628, "step": 4230}, {"loss": 1.483, "grad_norm": 0.6379408240318298, "learning_rate": 0.0002, "epoch": 3.5585396558959292, "step": 4240}, {"loss": 1.5184, "grad_norm": 0.6265586018562317, "learning_rate": 0.0002, "epoch": 3.5669324381032315, "step": 4250}, {"loss": 1.5047, "grad_norm": 0.5378820896148682, "learning_rate": 0.0002, "epoch": 3.575325220310533, "step": 4260}, {"loss": 1.5668, "grad_norm": 0.6800801753997803, "learning_rate": 0.0002, "epoch": 3.5837180025178346, "step": 4270}, {"loss": 1.5363, "grad_norm": 0.5653113126754761, "learning_rate": 0.0002, "epoch": 3.5921107847251363, "step": 4280}, {"loss": 1.5007, "grad_norm": 0.548647940158844, "learning_rate": 0.0002, "epoch": 3.600503566932438, "step": 4290}, {"loss": 1.5034, "grad_norm": 0.5729944705963135, "learning_rate": 0.0002, "epoch": 3.60889634913974, "step": 4300}, {"loss": 1.575, "grad_norm": 0.6204999685287476, "learning_rate": 0.0002, "epoch": 3.6172891313470417, "step": 4310}, {"loss": 1.5107, "grad_norm": 0.6275812983512878, "learning_rate": 0.0002, "epoch": 3.6256819135543434, "step": 4320}, {"loss": 1.5013, "grad_norm": 0.7261835336685181, "learning_rate": 0.0002, "epoch": 3.6340746957616448, "step": 4330}, {"loss": 1.5128, "grad_norm": 0.6048004627227783, "learning_rate": 0.0002, "epoch": 3.6424674779689465, "step": 4340}, {"loss": 1.5106, "grad_norm": 0.5879671573638916, "learning_rate": 0.0002, "epoch": 3.6508602601762483, "step": 4350}, {"loss": 1.5477, "grad_norm": 0.6001018285751343, "learning_rate": 0.0002, "epoch": 3.65925304238355, "step": 4360}, {"loss": 1.5247, "grad_norm": 0.6468151211738586, "learning_rate": 0.0002, "epoch": 3.667645824590852, "step": 4370}, {"loss": 1.563, "grad_norm": 0.6342051029205322, "learning_rate": 0.0002, "epoch": 3.6760386067981536, "step": 4380}, {"loss": 1.5444, "grad_norm": 0.6078384518623352, "learning_rate": 0.0002, "epoch": 3.6844313890054554, "step": 4390}, {"loss": 1.5546, "grad_norm": 0.5555588006973267, "learning_rate": 0.0002, "epoch": 3.692824171212757, "step": 4400}, {"loss": 1.5694, "grad_norm": 0.6089665293693542, "learning_rate": 0.0002, "epoch": 3.701216953420059, "step": 4410}, {"loss": 1.5898, "grad_norm": 0.6225191950798035, "learning_rate": 0.0002, "epoch": 3.7096097356273603, "step": 4420}, {"loss": 1.5153, "grad_norm": 0.5642715692520142, "learning_rate": 0.0002, "epoch": 3.718002517834662, "step": 4430}, {"loss": 1.5057, "grad_norm": 0.5703449845314026, "learning_rate": 0.0002, "epoch": 3.726395300041964, "step": 4440}, {"loss": 1.5451, "grad_norm": 0.6029745936393738, "learning_rate": 0.0002, "epoch": 3.7347880822492656, "step": 4450}, {"loss": 1.5044, "grad_norm": 0.7089189887046814, "learning_rate": 0.0002, "epoch": 3.7431808644565674, "step": 4460}, {"loss": 1.4804, "grad_norm": 0.6230936050415039, "learning_rate": 0.0002, "epoch": 3.751573646663869, "step": 4470}, {"loss": 1.567, "grad_norm": 0.5718494653701782, "learning_rate": 0.0002, "epoch": 3.759966428871171, "step": 4480}, {"loss": 1.5612, "grad_norm": 0.5404117703437805, "learning_rate": 0.0002, "epoch": 3.7683592110784723, "step": 4490}, {"loss": 1.4707, "grad_norm": 0.5816529393196106, "learning_rate": 0.0002, "epoch": 3.7767519932857745, "step": 4500}, {"loss": 1.5802, "grad_norm": 0.6314901113510132, "learning_rate": 0.0002, "epoch": 3.785144775493076, "step": 4510}, {"loss": 1.5445, "grad_norm": 0.7639698386192322, "learning_rate": 0.0002, "epoch": 3.7935375577003776, "step": 4520}, {"loss": 1.5718, "grad_norm": 0.5727366209030151, "learning_rate": 0.0002, "epoch": 3.8019303399076794, "step": 4530}, {"loss": 1.5409, "grad_norm": 0.6467128396034241, "learning_rate": 0.0002, "epoch": 3.810323122114981, "step": 4540}, {"loss": 1.5266, "grad_norm": 0.6572837233543396, "learning_rate": 0.0002, "epoch": 3.818715904322283, "step": 4550}, {"loss": 1.5718, "grad_norm": 0.5847418904304504, "learning_rate": 0.0002, "epoch": 3.8271086865295847, "step": 4560}, {"loss": 1.5303, "grad_norm": 0.48820871114730835, "learning_rate": 0.0002, "epoch": 3.8355014687368865, "step": 4570}, {"loss": 1.4911, "grad_norm": 1.2537429332733154, "learning_rate": 0.0002, "epoch": 3.843894250944188, "step": 4580}, {"loss": 1.5522, "grad_norm": 0.6026989221572876, "learning_rate": 0.0002, "epoch": 3.8522870331514896, "step": 4590}, {"loss": 1.5035, "grad_norm": 0.5541417598724365, "learning_rate": 0.0002, "epoch": 3.8606798153587913, "step": 4600}, {"loss": 1.5238, "grad_norm": 0.7668771147727966, "learning_rate": 0.0002, "epoch": 3.869072597566093, "step": 4610}, {"loss": 1.5428, "grad_norm": 0.6181227564811707, "learning_rate": 0.0002, "epoch": 3.877465379773395, "step": 4620}, {"loss": 1.5242, "grad_norm": 0.5842700004577637, "learning_rate": 0.0002, "epoch": 3.8858581619806967, "step": 4630}, {"loss": 1.5501, "grad_norm": 0.5824751257896423, "learning_rate": 0.0002, "epoch": 3.8942509441879984, "step": 4640}, {"loss": 1.4443, "grad_norm": 0.6212735772132874, "learning_rate": 0.0002, "epoch": 3.9026437263952998, "step": 4650}, {"loss": 1.4972, "grad_norm": 0.6123346090316772, "learning_rate": 0.0002, "epoch": 3.911036508602602, "step": 4660}, {"loss": 1.5531, "grad_norm": 0.518662691116333, "learning_rate": 0.0002, "epoch": 3.9194292908099033, "step": 4670}, {"loss": 1.5151, "grad_norm": 0.6963476538658142, "learning_rate": 0.0002, "epoch": 3.927822073017205, "step": 4680}, {"loss": 1.5826, "grad_norm": 0.5192152261734009, "learning_rate": 0.0002, "epoch": 3.936214855224507, "step": 4690}, {"loss": 1.5312, "grad_norm": 0.5820888876914978, "learning_rate": 0.0002, "epoch": 3.9446076374318086, "step": 4700}, {"loss": 1.527, "grad_norm": 0.6320387721061707, "learning_rate": 0.0002, "epoch": 3.9530004196391104, "step": 4710}, {"loss": 1.6006, "grad_norm": 0.6174548268318176, "learning_rate": 0.0002, "epoch": 3.961393201846412, "step": 4720}, {"loss": 1.5581, "grad_norm": 0.6691966652870178, "learning_rate": 0.0002, "epoch": 3.969785984053714, "step": 4730}, {"loss": 1.4762, "grad_norm": 0.5972068309783936, "learning_rate": 0.0002, "epoch": 3.9781787662610153, "step": 4740}, {"loss": 1.4947, "grad_norm": 0.5759536027908325, "learning_rate": 0.0002, "epoch": 3.9865715484683175, "step": 4750}, {"loss": 1.4836, "grad_norm": 0.5886756777763367, "learning_rate": 0.0002, "epoch": 3.994964330675619, "step": 4760}, {"eval_loss": 1.8749940395355225, "eval_runtime": 38.037, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 4.0, "step": 4766}, {"loss": 1.5259, "grad_norm": 0.5915011167526245, "learning_rate": 0.0002, "epoch": 4.003357112882921, "step": 4770}, {"loss": 1.4071, "grad_norm": 0.8565000891685486, "learning_rate": 0.0002, "epoch": 4.011749895090222, "step": 4780}, {"loss": 1.3211, "grad_norm": 0.7753950953483582, "learning_rate": 0.0002, "epoch": 4.020142677297524, "step": 4790}, {"loss": 1.3607, "grad_norm": 0.6837254166603088, "learning_rate": 0.0002, "epoch": 4.028535459504826, "step": 4800}, {"loss": 1.3275, "grad_norm": 0.8374526500701904, "learning_rate": 0.0002, "epoch": 4.036928241712127, "step": 4810}, {"loss": 1.3579, "grad_norm": 0.8717963099479675, "learning_rate": 0.0002, "epoch": 4.0453210239194295, "step": 4820}, {"loss": 1.3374, "grad_norm": 0.7002043724060059, "learning_rate": 0.0002, "epoch": 4.053713806126731, "step": 4830}, {"loss": 1.3882, "grad_norm": 1.0319572687149048, "learning_rate": 0.0002, "epoch": 4.062106588334033, "step": 4840}, {"loss": 1.3291, "grad_norm": 0.6746882200241089, "learning_rate": 0.0002, "epoch": 4.070499370541334, "step": 4850}, {"loss": 1.339, "grad_norm": 0.8187578320503235, "learning_rate": 0.0002, "epoch": 4.078892152748637, "step": 4860}, {"loss": 1.368, "grad_norm": 0.7888399362564087, "learning_rate": 0.0002, "epoch": 4.087284934955938, "step": 4870}, {"loss": 1.4115, "grad_norm": 0.7149351239204407, "learning_rate": 0.0002, "epoch": 4.095677717163239, "step": 4880}, {"loss": 1.341, "grad_norm": 0.9067983031272888, "learning_rate": 0.0002, "epoch": 4.1040704993705415, "step": 4890}, {"loss": 1.4084, "grad_norm": 0.771186351776123, "learning_rate": 0.0002, "epoch": 4.112463281577843, "step": 4900}, {"loss": 1.2722, "grad_norm": 0.7756485342979431, "learning_rate": 0.0002, "epoch": 4.120856063785145, "step": 4910}, {"loss": 1.4138, "grad_norm": 0.7149116396903992, "learning_rate": 0.0002, "epoch": 4.129248845992446, "step": 4920}, {"loss": 1.3102, "grad_norm": 0.700442910194397, "learning_rate": 0.0002, "epoch": 4.137641628199749, "step": 4930}, {"loss": 1.3628, "grad_norm": 0.8439189195632935, "learning_rate": 0.0002, "epoch": 4.14603441040705, "step": 4940}, {"loss": 1.3511, "grad_norm": 0.6570779085159302, "learning_rate": 0.0002, "epoch": 4.154427192614351, "step": 4950}, {"loss": 1.3955, "grad_norm": 0.886482298374176, "learning_rate": 0.0002, "epoch": 4.1628199748216534, "step": 4960}, {"loss": 1.4083, "grad_norm": 0.7220938801765442, "learning_rate": 0.0002, "epoch": 4.171212757028955, "step": 4970}, {"loss": 1.3611, "grad_norm": 0.7185905575752258, "learning_rate": 0.0002, "epoch": 4.179605539236257, "step": 4980}, {"loss": 1.3623, "grad_norm": 0.7566333413124084, "learning_rate": 0.0002, "epoch": 4.187998321443558, "step": 4990}, {"loss": 1.2771, "grad_norm": 0.6960445642471313, "learning_rate": 0.0002, "epoch": 4.1963911036508605, "step": 5000}, {"loss": 1.3565, "grad_norm": 0.7727336883544922, "learning_rate": 0.0002, "epoch": 4.204783885858162, "step": 5010}, {"loss": 1.4156, "grad_norm": 0.8038365244865417, "learning_rate": 0.0002, "epoch": 4.213176668065464, "step": 5020}, {"loss": 1.3849, "grad_norm": 0.7587628364562988, "learning_rate": 0.0002, "epoch": 4.221569450272765, "step": 5030}, {"loss": 1.4047, "grad_norm": 0.928032398223877, "learning_rate": 0.0002, "epoch": 4.229962232480067, "step": 5040}, {"loss": 1.3768, "grad_norm": 0.7168642282485962, "learning_rate": 0.0002, "epoch": 4.238355014687369, "step": 5050}, {"loss": 1.3767, "grad_norm": 0.7981422543525696, "learning_rate": 0.0002, "epoch": 4.24674779689467, "step": 5060}, {"loss": 1.406, "grad_norm": 0.6951150894165039, "learning_rate": 0.0002, "epoch": 4.2551405791019725, "step": 5070}, {"loss": 1.3776, "grad_norm": 0.7337371706962585, "learning_rate": 0.0002, "epoch": 4.263533361309274, "step": 5080}, {"loss": 1.3425, "grad_norm": 0.8367464542388916, "learning_rate": 0.0002, "epoch": 4.271926143516576, "step": 5090}, {"loss": 1.3823, "grad_norm": 0.6744083166122437, "learning_rate": 0.0002, "epoch": 4.280318925723877, "step": 5100}, {"loss": 1.4183, "grad_norm": 0.9072301387786865, "learning_rate": 0.0002, "epoch": 4.28871170793118, "step": 5110}, {"loss": 1.4219, "grad_norm": 0.7703930735588074, "learning_rate": 0.0002, "epoch": 4.297104490138481, "step": 5120}, {"loss": 1.3658, "grad_norm": 0.6734083294868469, "learning_rate": 0.0002, "epoch": 4.305497272345782, "step": 5130}, {"loss": 1.441, "grad_norm": 0.7835540175437927, "learning_rate": 0.0002, "epoch": 4.3138900545530845, "step": 5140}, {"loss": 1.384, "grad_norm": 1.0822200775146484, "learning_rate": 0.0002, "epoch": 4.322282836760386, "step": 5150}, {"loss": 1.4167, "grad_norm": 0.8432536721229553, "learning_rate": 0.0002, "epoch": 4.330675618967688, "step": 5160}, {"loss": 1.3796, "grad_norm": 0.6739283800125122, "learning_rate": 0.0002, "epoch": 4.339068401174989, "step": 5170}, {"loss": 1.3651, "grad_norm": 0.7395278811454773, "learning_rate": 0.0002, "epoch": 4.347461183382292, "step": 5180}, {"loss": 1.3258, "grad_norm": 0.7638891339302063, "learning_rate": 0.0002, "epoch": 4.355853965589593, "step": 5190}, {"loss": 1.34, "grad_norm": 1.1222662925720215, "learning_rate": 0.0002, "epoch": 4.364246747796894, "step": 5200}, {"loss": 1.3757, "grad_norm": 0.9102525115013123, "learning_rate": 0.0002, "epoch": 4.3726395300041965, "step": 5210}, {"loss": 1.413, "grad_norm": 0.7181593775749207, "learning_rate": 0.0002, "epoch": 4.381032312211498, "step": 5220}, {"loss": 1.3808, "grad_norm": 0.7813979387283325, "learning_rate": 0.0002, "epoch": 4.3894250944188, "step": 5230}, {"loss": 1.423, "grad_norm": 0.8906185626983643, "learning_rate": 0.0002, "epoch": 4.397817876626101, "step": 5240}, {"loss": 1.3901, "grad_norm": 0.7456443309783936, "learning_rate": 0.0002, "epoch": 4.406210658833404, "step": 5250}, {"loss": 1.3292, "grad_norm": 0.8752070069313049, "learning_rate": 0.0002, "epoch": 4.414603441040705, "step": 5260}, {"loss": 1.3351, "grad_norm": 0.9560954570770264, "learning_rate": 0.0002, "epoch": 4.422996223248007, "step": 5270}, {"loss": 1.3708, "grad_norm": 0.7227762341499329, "learning_rate": 0.0002, "epoch": 4.4313890054553084, "step": 5280}, {"loss": 1.4281, "grad_norm": 0.8141599893569946, "learning_rate": 0.0002, "epoch": 4.43978178766261, "step": 5290}, {"loss": 1.381, "grad_norm": 0.928382158279419, "learning_rate": 0.0002, "epoch": 4.448174569869912, "step": 5300}, {"loss": 1.3586, "grad_norm": 0.7719997763633728, "learning_rate": 0.0002, "epoch": 4.456567352077213, "step": 5310}, {"loss": 1.3652, "grad_norm": 0.8081879615783691, "learning_rate": 0.0002, "epoch": 4.4649601342845155, "step": 5320}, {"loss": 1.4121, "grad_norm": 0.7903412580490112, "learning_rate": 0.0002, "epoch": 4.473352916491817, "step": 5330}, {"loss": 1.4453, "grad_norm": 0.7751287221908569, "learning_rate": 0.0002, "epoch": 4.481745698699119, "step": 5340}, {"loss": 1.392, "grad_norm": 0.8287544250488281, "learning_rate": 0.0002, "epoch": 4.49013848090642, "step": 5350}, {"loss": 1.3841, "grad_norm": 0.7431012392044067, "learning_rate": 0.0002, "epoch": 4.498531263113723, "step": 5360}, {"loss": 1.3843, "grad_norm": 0.8648661971092224, "learning_rate": 0.0002, "epoch": 4.506924045321024, "step": 5370}, {"loss": 1.3742, "grad_norm": 0.9314997792243958, "learning_rate": 0.0002, "epoch": 4.515316827528325, "step": 5380}, {"loss": 1.354, "grad_norm": 0.7530864477157593, "learning_rate": 0.0002, "epoch": 4.5237096097356275, "step": 5390}, {"loss": 1.4159, "grad_norm": 0.8739821910858154, "learning_rate": 0.0002, "epoch": 4.532102391942929, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.8090344667434692, "learning_rate": 0.0002, "epoch": 4.540495174150231, "step": 5410}, {"loss": 1.4187, "grad_norm": 0.7530879974365234, "learning_rate": 0.0002, "epoch": 4.548887956357532, "step": 5420}, {"loss": 1.47, "grad_norm": 0.8787251114845276, "learning_rate": 0.0002, "epoch": 4.557280738564835, "step": 5430}, {"loss": 1.375, "grad_norm": 0.813961923122406, "learning_rate": 0.0002, "epoch": 4.565673520772136, "step": 5440}, {"loss": 1.4475, "grad_norm": 0.7778232097625732, "learning_rate": 0.0002, "epoch": 4.574066302979437, "step": 5450}, {"loss": 1.4421, "grad_norm": 0.7323020696640015, "learning_rate": 0.0002, "epoch": 4.5824590851867395, "step": 5460}, {"loss": 1.396, "grad_norm": 0.7826765179634094, "learning_rate": 0.0002, "epoch": 4.590851867394041, "step": 5470}, {"loss": 1.4068, "grad_norm": 0.7245969772338867, "learning_rate": 0.0002, "epoch": 4.599244649601343, "step": 5480}, {"loss": 1.4276, "grad_norm": 0.7697308659553528, "learning_rate": 0.0002, "epoch": 4.607637431808644, "step": 5490}, {"loss": 1.3849, "grad_norm": 0.8053571581840515, "learning_rate": 0.0002, "epoch": 4.616030214015947, "step": 5500}, {"loss": 1.4225, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.624422996223248, "step": 5510}, {"loss": 1.3771, "grad_norm": 0.7398585677146912, "learning_rate": 0.0002, "epoch": 4.632815778430549, "step": 5520}, {"loss": 1.4216, "grad_norm": 0.7896319031715393, "learning_rate": 0.0002, "epoch": 4.6412085606378515, "step": 5530}, {"loss": 1.4199, "grad_norm": 0.8290980458259583, "learning_rate": 0.0002, "epoch": 4.649601342845153, "step": 5540}, {"loss": 1.463, "grad_norm": 0.8232647776603699, "learning_rate": 0.0002, "epoch": 4.657994125052455, "step": 5550}, {"loss": 1.3925, "grad_norm": 0.9154987335205078, "learning_rate": 0.0002, "epoch": 4.666386907259756, "step": 5560}, {"loss": 1.3674, "grad_norm": 0.8400886654853821, "learning_rate": 0.0002, "epoch": 4.674779689467059, "step": 5570}, {"loss": 1.379, "grad_norm": 0.7312718629837036, "learning_rate": 0.0002, "epoch": 4.68317247167436, "step": 5580}, {"loss": 1.3925, "grad_norm": 0.8043803572654724, "learning_rate": 0.0002, "epoch": 4.691565253881662, "step": 5590}, {"loss": 1.3952, "grad_norm": 0.7966225147247314, "learning_rate": 0.0002, "epoch": 4.6999580360889635, "step": 5600}, {"loss": 1.3429, "grad_norm": 0.881574809551239, "learning_rate": 0.0002, "epoch": 4.708350818296266, "step": 5610}, {"loss": 1.4444, "grad_norm": 0.7252084016799927, "learning_rate": 0.0002, "epoch": 4.716743600503567, "step": 5620}, {"loss": 1.3566, "grad_norm": 0.7726518511772156, "learning_rate": 0.0002, "epoch": 4.725136382710868, "step": 5630}, {"loss": 1.3954, "grad_norm": 0.7306379079818726, "learning_rate": 0.0002, "epoch": 4.7335291649181706, "step": 5640}, {"loss": 1.4385, "grad_norm": 0.8029969334602356, "learning_rate": 0.0002, "epoch": 4.741921947125472, "step": 5650}, {"loss": 1.3966, "grad_norm": 0.9103893637657166, "learning_rate": 0.0002, "epoch": 4.750314729332774, "step": 5660}, {"loss": 1.4026, "grad_norm": 0.8783416748046875, "learning_rate": 0.0002, "epoch": 4.758707511540075, "step": 5670}, {"loss": 1.3427, "grad_norm": 0.6807119846343994, "learning_rate": 0.0002, "epoch": 4.767100293747378, "step": 5680}, {"loss": 1.4148, "grad_norm": 0.7103772759437561, "learning_rate": 0.0002, "epoch": 4.775493075954679, "step": 5690}, {"loss": 1.4079, "grad_norm": 0.8472093343734741, "learning_rate": 0.0002, "epoch": 4.78388585816198, "step": 5700}, {"loss": 1.3937, "grad_norm": 0.851847231388092, "learning_rate": 0.0002, "epoch": 4.7922786403692825, "step": 5710}, {"loss": 1.3965, "grad_norm": 0.9084636569023132, "learning_rate": 0.0002, "epoch": 4.800671422576584, "step": 5720}, {"loss": 1.4358, "grad_norm": 0.7628585696220398, "learning_rate": 0.0002, "epoch": 4.809064204783886, "step": 5730}, {"loss": 1.3746, "grad_norm": 0.775580883026123, "learning_rate": 0.0002, "epoch": 4.817456986991187, "step": 5740}, {"loss": 1.4573, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 4.82584976919849, "step": 5750}, {"loss": 1.3991, "grad_norm": 0.7021728754043579, "learning_rate": 0.0002, "epoch": 4.834242551405791, "step": 5760}, {"loss": 1.4012, "grad_norm": 0.7810541391372681, "learning_rate": 0.0002, "epoch": 4.842635333613092, "step": 5770}, {"loss": 1.396, "grad_norm": 0.7290041446685791, "learning_rate": 0.0002, "epoch": 4.8510281158203945, "step": 5780}, {"loss": 1.4769, "grad_norm": 0.9059709906578064, "learning_rate": 0.0002, "epoch": 4.859420898027696, "step": 5790}, {"loss": 1.4091, "grad_norm": 0.8338062167167664, "learning_rate": 0.0002, "epoch": 4.867813680234998, "step": 5800}, {"loss": 1.395, "grad_norm": 0.830926775932312, "learning_rate": 0.0002, "epoch": 4.876206462442299, "step": 5810}, {"loss": 1.4261, "grad_norm": 0.7818633317947388, "learning_rate": 0.0002, "epoch": 4.884599244649602, "step": 5820}, {"loss": 1.4252, "grad_norm": 0.8143376708030701, "learning_rate": 0.0002, "epoch": 4.892992026856903, "step": 5830}, {"loss": 1.3583, "grad_norm": 0.7754496335983276, "learning_rate": 0.0002, "epoch": 4.901384809064205, "step": 5840}, {"loss": 1.4036, "grad_norm": 0.7154468297958374, "learning_rate": 0.0002, "epoch": 4.9097775912715065, "step": 5850}, {"loss": 1.3909, "grad_norm": 0.6829783916473389, "learning_rate": 0.0002, "epoch": 4.918170373478809, "step": 5860}, {"loss": 1.3854, "grad_norm": 0.784919261932373, "learning_rate": 0.0002, "epoch": 4.92656315568611, "step": 5870}, {"loss": 1.4277, "grad_norm": 0.8168354034423828, "learning_rate": 0.0002, "epoch": 4.934955937893411, "step": 5880}, {"loss": 1.3694, "grad_norm": 0.7356618642807007, "learning_rate": 0.0002, "epoch": 4.943348720100714, "step": 5890}, {"loss": 1.4827, "grad_norm": 0.7399224042892456, "learning_rate": 0.0002, "epoch": 4.951741502308015, "step": 5900}, {"loss": 1.3643, "grad_norm": 0.7430436015129089, "learning_rate": 0.0002, "epoch": 4.960134284515317, "step": 5910}, {"loss": 1.3836, "grad_norm": 0.7587705850601196, "learning_rate": 0.0002, "epoch": 4.9685270667226185, "step": 5920}, {"loss": 1.4162, "grad_norm": 0.9103638529777527, "learning_rate": 0.0002, "epoch": 4.976919848929921, "step": 5930}, {"loss": 1.4688, "grad_norm": 0.7357394695281982, "learning_rate": 0.0002, "epoch": 4.985312631137222, "step": 5940}, {"loss": 1.3988, "grad_norm": 0.7371547222137451, "learning_rate": 0.0002, "epoch": 4.993705413344523, "step": 5950}]} +{"epoch": 6.0, "step": 7149, "epoch_duration": 1279.4778280258179, "total_accumulated_duration": 7747.131546497345, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}, {"eval_loss": 1.824695348739624, "eval_runtime": 37.947, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.999580360889635, "step": 3574}, {"loss": 1.5633, "grad_norm": 0.4380604326725006, "learning_rate": 0.0002, "epoch": 3.004616030214016, "step": 3580}, {"loss": 1.4474, "grad_norm": 0.5375564098358154, "learning_rate": 0.0002, "epoch": 3.0130088124213175, "step": 3590}, {"loss": 1.5738, "grad_norm": 0.50722736120224, "learning_rate": 0.0002, "epoch": 3.0214015946286192, "step": 3600}, {"loss": 1.5191, "grad_norm": 0.5398766994476318, "learning_rate": 0.0002, "epoch": 3.029794376835921, "step": 3610}, {"loss": 1.4401, "grad_norm": 0.520709753036499, "learning_rate": 0.0002, "epoch": 3.038187159043223, "step": 3620}, {"loss": 1.5704, "grad_norm": 0.5429664850234985, "learning_rate": 0.0002, "epoch": 3.0465799412505246, "step": 3630}, {"loss": 1.5516, "grad_norm": 0.5634943842887878, "learning_rate": 0.0002, "epoch": 3.0549727234578263, "step": 3640}, {"loss": 1.5349, "grad_norm": 0.5042277574539185, "learning_rate": 0.0002, "epoch": 3.063365505665128, "step": 3650}, {"loss": 1.4708, "grad_norm": 0.5778711438179016, "learning_rate": 0.0002, "epoch": 3.07175828787243, "step": 3660}, {"loss": 1.5196, "grad_norm": 0.5504926443099976, "learning_rate": 0.0002, "epoch": 3.080151070079731, "step": 3670}, {"loss": 1.473, "grad_norm": 0.5199463963508606, "learning_rate": 0.0002, "epoch": 3.088543852287033, "step": 3680}, {"loss": 1.5064, "grad_norm": 0.552334189414978, "learning_rate": 0.0002, "epoch": 3.0969366344943348, "step": 3690}, {"loss": 1.4638, "grad_norm": 0.5650873780250549, "learning_rate": 0.0002, "epoch": 3.1053294167016365, "step": 3700}, {"loss": 1.4945, "grad_norm": 0.6292349696159363, "learning_rate": 0.0002, "epoch": 3.1137221989089383, "step": 3710}, {"loss": 1.4787, "grad_norm": 0.5523604154586792, "learning_rate": 0.0002, "epoch": 3.12211498111624, "step": 3720}, {"loss": 1.4697, "grad_norm": 0.6160100698471069, "learning_rate": 0.0002, "epoch": 3.130507763323542, "step": 3730}, {"loss": 1.5589, "grad_norm": 0.6091629266738892, "learning_rate": 0.0002, "epoch": 3.1389005455308436, "step": 3740}, {"loss": 1.4659, "grad_norm": 0.5695531964302063, "learning_rate": 0.0002, "epoch": 3.1472933277381454, "step": 3750}, {"loss": 1.4605, "grad_norm": 0.569611132144928, "learning_rate": 0.0002, "epoch": 3.1556861099454467, "step": 3760}, {"loss": 1.4592, "grad_norm": 0.5761140584945679, "learning_rate": 0.0002, "epoch": 3.1640788921527485, "step": 3770}, {"loss": 1.4999, "grad_norm": 0.6855548620223999, "learning_rate": 0.0002, "epoch": 3.1724716743600503, "step": 3780}, {"loss": 1.5047, "grad_norm": 0.5815101265907288, "learning_rate": 0.0002, "epoch": 3.180864456567352, "step": 3790}, {"loss": 1.5289, "grad_norm": 0.6179960370063782, "learning_rate": 0.0002, "epoch": 3.189257238774654, "step": 3800}, {"loss": 1.4833, "grad_norm": 0.5418674349784851, "learning_rate": 0.0002, "epoch": 3.1976500209819556, "step": 3810}, {"loss": 1.4994, "grad_norm": 0.5655816197395325, "learning_rate": 0.0002, "epoch": 3.2060428031892574, "step": 3820}, {"loss": 1.5007, "grad_norm": 0.7279291152954102, "learning_rate": 0.0002, "epoch": 3.214435585396559, "step": 3830}, {"loss": 1.5672, "grad_norm": 0.490998238325119, "learning_rate": 0.0002, "epoch": 3.2228283676038605, "step": 3840}, {"loss": 1.4683, "grad_norm": 0.6065797209739685, "learning_rate": 0.0002, "epoch": 3.2312211498111623, "step": 3850}, {"loss": 1.5153, "grad_norm": 0.6024682521820068, "learning_rate": 0.0002, "epoch": 3.239613932018464, "step": 3860}, {"loss": 1.5123, "grad_norm": 0.5571125745773315, "learning_rate": 0.0002, "epoch": 3.248006714225766, "step": 3870}, {"loss": 1.4609, "grad_norm": 0.5662134289741516, "learning_rate": 0.0002, "epoch": 3.2563994964330676, "step": 3880}, {"loss": 1.5452, "grad_norm": 0.5936661958694458, "learning_rate": 0.0002, "epoch": 3.2647922786403694, "step": 3890}, {"loss": 1.5149, "grad_norm": 0.6739671230316162, "learning_rate": 0.0002, "epoch": 3.273185060847671, "step": 3900}, {"loss": 1.5101, "grad_norm": 0.5579532384872437, "learning_rate": 0.0002, "epoch": 3.281577843054973, "step": 3910}, {"loss": 1.4788, "grad_norm": 0.6595954298973083, "learning_rate": 0.0002, "epoch": 3.2899706252622742, "step": 3920}, {"loss": 1.473, "grad_norm": 0.5712262988090515, "learning_rate": 0.0002, "epoch": 3.298363407469576, "step": 3930}, {"loss": 1.5512, "grad_norm": 0.5601761341094971, "learning_rate": 0.0002, "epoch": 3.306756189676878, "step": 3940}, {"loss": 1.4904, "grad_norm": 0.5759967565536499, "learning_rate": 0.0002, "epoch": 3.3151489718841796, "step": 3950}, {"loss": 1.4885, "grad_norm": 0.6543047428131104, "learning_rate": 0.0002, "epoch": 3.3235417540914813, "step": 3960}, {"loss": 1.5063, "grad_norm": 0.6355253458023071, "learning_rate": 0.0002, "epoch": 3.331934536298783, "step": 3970}, {"loss": 1.5025, "grad_norm": 0.5671007633209229, "learning_rate": 0.0002, "epoch": 3.340327318506085, "step": 3980}, {"loss": 1.5049, "grad_norm": 0.6743636727333069, "learning_rate": 0.0002, "epoch": 3.3487201007133867, "step": 3990}, {"loss": 1.5527, "grad_norm": 0.500627338886261, "learning_rate": 0.0002, "epoch": 3.3571128829206884, "step": 4000}, {"loss": 1.4884, "grad_norm": 0.5666340589523315, "learning_rate": 0.0002, "epoch": 3.3655056651279898, "step": 4010}, {"loss": 1.5104, "grad_norm": 0.5651408433914185, "learning_rate": 0.0002, "epoch": 3.3738984473352915, "step": 4020}, {"loss": 1.4907, "grad_norm": 0.6338897943496704, "learning_rate": 0.0002, "epoch": 3.3822912295425933, "step": 4030}, {"loss": 1.553, "grad_norm": 0.5781935453414917, "learning_rate": 0.0002, "epoch": 3.390684011749895, "step": 4040}, {"loss": 1.5535, "grad_norm": 0.55543053150177, "learning_rate": 0.0002, "epoch": 3.399076793957197, "step": 4050}, {"loss": 1.4884, "grad_norm": 0.6602614521980286, "learning_rate": 0.0002, "epoch": 3.4074695761644986, "step": 4060}, {"loss": 1.471, "grad_norm": 0.5514156222343445, "learning_rate": 0.0002, "epoch": 3.4158623583718004, "step": 4070}, {"loss": 1.4634, "grad_norm": 0.5760560035705566, "learning_rate": 0.0002, "epoch": 3.4242551405791017, "step": 4080}, {"loss": 1.4662, "grad_norm": 0.657503604888916, "learning_rate": 0.0002, "epoch": 3.4326479227864035, "step": 4090}, {"loss": 1.5041, "grad_norm": 0.5746736526489258, "learning_rate": 0.0002, "epoch": 3.4410407049937053, "step": 4100}, {"loss": 1.4387, "grad_norm": 0.5988999009132385, "learning_rate": 0.0002, "epoch": 3.449433487201007, "step": 4110}, {"loss": 1.5475, "grad_norm": 0.7294586300849915, "learning_rate": 0.0002, "epoch": 3.457826269408309, "step": 4120}, {"loss": 1.4878, "grad_norm": 0.6391161680221558, "learning_rate": 0.0002, "epoch": 3.4662190516156106, "step": 4130}, {"loss": 1.5366, "grad_norm": 0.6416470408439636, "learning_rate": 0.0002, "epoch": 3.4746118338229124, "step": 4140}, {"loss": 1.5587, "grad_norm": 0.5710626244544983, "learning_rate": 0.0002, "epoch": 3.483004616030214, "step": 4150}, {"loss": 1.4661, "grad_norm": 0.5370054841041565, "learning_rate": 0.0002, "epoch": 3.491397398237516, "step": 4160}, {"loss": 1.5167, "grad_norm": 0.5559558272361755, "learning_rate": 0.0002, "epoch": 3.4997901804448173, "step": 4170}, {"loss": 1.4244, "grad_norm": 0.5426168441772461, "learning_rate": 0.0002, "epoch": 3.508182962652119, "step": 4180}, {"loss": 1.5241, "grad_norm": 0.5997438430786133, "learning_rate": 0.0002, "epoch": 3.516575744859421, "step": 4190}, {"loss": 1.6091, "grad_norm": 0.5399143099784851, "learning_rate": 0.0002, "epoch": 3.5249685270667226, "step": 4200}, {"loss": 1.5066, "grad_norm": 0.6341416239738464, "learning_rate": 0.0002, "epoch": 3.5333613092740244, "step": 4210}, {"loss": 1.5436, "grad_norm": 0.632238507270813, "learning_rate": 0.0002, "epoch": 3.541754091481326, "step": 4220}, {"loss": 1.5423, "grad_norm": 0.6356478333473206, "learning_rate": 0.0002, "epoch": 3.550146873688628, "step": 4230}, {"loss": 1.483, "grad_norm": 0.6379408240318298, "learning_rate": 0.0002, "epoch": 3.5585396558959292, "step": 4240}, {"loss": 1.5184, "grad_norm": 0.6265586018562317, "learning_rate": 0.0002, "epoch": 3.5669324381032315, "step": 4250}, {"loss": 1.5047, "grad_norm": 0.5378820896148682, "learning_rate": 0.0002, "epoch": 3.575325220310533, "step": 4260}, {"loss": 1.5668, "grad_norm": 0.6800801753997803, "learning_rate": 0.0002, "epoch": 3.5837180025178346, "step": 4270}, {"loss": 1.5363, "grad_norm": 0.5653113126754761, "learning_rate": 0.0002, "epoch": 3.5921107847251363, "step": 4280}, {"loss": 1.5007, "grad_norm": 0.548647940158844, "learning_rate": 0.0002, "epoch": 3.600503566932438, "step": 4290}, {"loss": 1.5034, "grad_norm": 0.5729944705963135, "learning_rate": 0.0002, "epoch": 3.60889634913974, "step": 4300}, {"loss": 1.575, "grad_norm": 0.6204999685287476, "learning_rate": 0.0002, "epoch": 3.6172891313470417, "step": 4310}, {"loss": 1.5107, "grad_norm": 0.6275812983512878, "learning_rate": 0.0002, "epoch": 3.6256819135543434, "step": 4320}, {"loss": 1.5013, "grad_norm": 0.7261835336685181, "learning_rate": 0.0002, "epoch": 3.6340746957616448, "step": 4330}, {"loss": 1.5128, "grad_norm": 0.6048004627227783, "learning_rate": 0.0002, "epoch": 3.6424674779689465, "step": 4340}, {"loss": 1.5106, "grad_norm": 0.5879671573638916, "learning_rate": 0.0002, "epoch": 3.6508602601762483, "step": 4350}, {"loss": 1.5477, "grad_norm": 0.6001018285751343, "learning_rate": 0.0002, "epoch": 3.65925304238355, "step": 4360}, {"loss": 1.5247, "grad_norm": 0.6468151211738586, "learning_rate": 0.0002, "epoch": 3.667645824590852, "step": 4370}, {"loss": 1.563, "grad_norm": 0.6342051029205322, "learning_rate": 0.0002, "epoch": 3.6760386067981536, "step": 4380}, {"loss": 1.5444, "grad_norm": 0.6078384518623352, "learning_rate": 0.0002, "epoch": 3.6844313890054554, "step": 4390}, {"loss": 1.5546, "grad_norm": 0.5555588006973267, "learning_rate": 0.0002, "epoch": 3.692824171212757, "step": 4400}, {"loss": 1.5694, "grad_norm": 0.6089665293693542, "learning_rate": 0.0002, "epoch": 3.701216953420059, "step": 4410}, {"loss": 1.5898, "grad_norm": 0.6225191950798035, "learning_rate": 0.0002, "epoch": 3.7096097356273603, "step": 4420}, {"loss": 1.5153, "grad_norm": 0.5642715692520142, "learning_rate": 0.0002, "epoch": 3.718002517834662, "step": 4430}, {"loss": 1.5057, "grad_norm": 0.5703449845314026, "learning_rate": 0.0002, "epoch": 3.726395300041964, "step": 4440}, {"loss": 1.5451, "grad_norm": 0.6029745936393738, "learning_rate": 0.0002, "epoch": 3.7347880822492656, "step": 4450}, {"loss": 1.5044, "grad_norm": 0.7089189887046814, "learning_rate": 0.0002, "epoch": 3.7431808644565674, "step": 4460}, {"loss": 1.4804, "grad_norm": 0.6230936050415039, "learning_rate": 0.0002, "epoch": 3.751573646663869, "step": 4470}, {"loss": 1.567, "grad_norm": 0.5718494653701782, "learning_rate": 0.0002, "epoch": 3.759966428871171, "step": 4480}, {"loss": 1.5612, "grad_norm": 0.5404117703437805, "learning_rate": 0.0002, "epoch": 3.7683592110784723, "step": 4490}, {"loss": 1.4707, "grad_norm": 0.5816529393196106, "learning_rate": 0.0002, "epoch": 3.7767519932857745, "step": 4500}, {"loss": 1.5802, "grad_norm": 0.6314901113510132, "learning_rate": 0.0002, "epoch": 3.785144775493076, "step": 4510}, {"loss": 1.5445, "grad_norm": 0.7639698386192322, "learning_rate": 0.0002, "epoch": 3.7935375577003776, "step": 4520}, {"loss": 1.5718, "grad_norm": 0.5727366209030151, "learning_rate": 0.0002, "epoch": 3.8019303399076794, "step": 4530}, {"loss": 1.5409, "grad_norm": 0.6467128396034241, "learning_rate": 0.0002, "epoch": 3.810323122114981, "step": 4540}, {"loss": 1.5266, "grad_norm": 0.6572837233543396, "learning_rate": 0.0002, "epoch": 3.818715904322283, "step": 4550}, {"loss": 1.5718, "grad_norm": 0.5847418904304504, "learning_rate": 0.0002, "epoch": 3.8271086865295847, "step": 4560}, {"loss": 1.5303, "grad_norm": 0.48820871114730835, "learning_rate": 0.0002, "epoch": 3.8355014687368865, "step": 4570}, {"loss": 1.4911, "grad_norm": 1.2537429332733154, "learning_rate": 0.0002, "epoch": 3.843894250944188, "step": 4580}, {"loss": 1.5522, "grad_norm": 0.6026989221572876, "learning_rate": 0.0002, "epoch": 3.8522870331514896, "step": 4590}, {"loss": 1.5035, "grad_norm": 0.5541417598724365, "learning_rate": 0.0002, "epoch": 3.8606798153587913, "step": 4600}, {"loss": 1.5238, "grad_norm": 0.7668771147727966, "learning_rate": 0.0002, "epoch": 3.869072597566093, "step": 4610}, {"loss": 1.5428, "grad_norm": 0.6181227564811707, "learning_rate": 0.0002, "epoch": 3.877465379773395, "step": 4620}, {"loss": 1.5242, "grad_norm": 0.5842700004577637, "learning_rate": 0.0002, "epoch": 3.8858581619806967, "step": 4630}, {"loss": 1.5501, "grad_norm": 0.5824751257896423, "learning_rate": 0.0002, "epoch": 3.8942509441879984, "step": 4640}, {"loss": 1.4443, "grad_norm": 0.6212735772132874, "learning_rate": 0.0002, "epoch": 3.9026437263952998, "step": 4650}, {"loss": 1.4972, "grad_norm": 0.6123346090316772, "learning_rate": 0.0002, "epoch": 3.911036508602602, "step": 4660}, {"loss": 1.5531, "grad_norm": 0.518662691116333, "learning_rate": 0.0002, "epoch": 3.9194292908099033, "step": 4670}, {"loss": 1.5151, "grad_norm": 0.6963476538658142, "learning_rate": 0.0002, "epoch": 3.927822073017205, "step": 4680}, {"loss": 1.5826, "grad_norm": 0.5192152261734009, "learning_rate": 0.0002, "epoch": 3.936214855224507, "step": 4690}, {"loss": 1.5312, "grad_norm": 0.5820888876914978, "learning_rate": 0.0002, "epoch": 3.9446076374318086, "step": 4700}, {"loss": 1.527, "grad_norm": 0.6320387721061707, "learning_rate": 0.0002, "epoch": 3.9530004196391104, "step": 4710}, {"loss": 1.6006, "grad_norm": 0.6174548268318176, "learning_rate": 0.0002, "epoch": 3.961393201846412, "step": 4720}, {"loss": 1.5581, "grad_norm": 0.6691966652870178, "learning_rate": 0.0002, "epoch": 3.969785984053714, "step": 4730}, {"loss": 1.4762, "grad_norm": 0.5972068309783936, "learning_rate": 0.0002, "epoch": 3.9781787662610153, "step": 4740}, {"loss": 1.4947, "grad_norm": 0.5759536027908325, "learning_rate": 0.0002, "epoch": 3.9865715484683175, "step": 4750}, {"loss": 1.4836, "grad_norm": 0.5886756777763367, "learning_rate": 0.0002, "epoch": 3.994964330675619, "step": 4760}, {"eval_loss": 1.8749940395355225, "eval_runtime": 38.037, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 4.0, "step": 4766}, {"loss": 1.5259, "grad_norm": 0.5915011167526245, "learning_rate": 0.0002, "epoch": 4.003357112882921, "step": 4770}, {"loss": 1.4071, "grad_norm": 0.8565000891685486, "learning_rate": 0.0002, "epoch": 4.011749895090222, "step": 4780}, {"loss": 1.3211, "grad_norm": 0.7753950953483582, "learning_rate": 0.0002, "epoch": 4.020142677297524, "step": 4790}, {"loss": 1.3607, "grad_norm": 0.6837254166603088, "learning_rate": 0.0002, "epoch": 4.028535459504826, "step": 4800}, {"loss": 1.3275, "grad_norm": 0.8374526500701904, "learning_rate": 0.0002, "epoch": 4.036928241712127, "step": 4810}, {"loss": 1.3579, "grad_norm": 0.8717963099479675, "learning_rate": 0.0002, "epoch": 4.0453210239194295, "step": 4820}, {"loss": 1.3374, "grad_norm": 0.7002043724060059, "learning_rate": 0.0002, "epoch": 4.053713806126731, "step": 4830}, {"loss": 1.3882, "grad_norm": 1.0319572687149048, "learning_rate": 0.0002, "epoch": 4.062106588334033, "step": 4840}, {"loss": 1.3291, "grad_norm": 0.6746882200241089, "learning_rate": 0.0002, "epoch": 4.070499370541334, "step": 4850}, {"loss": 1.339, "grad_norm": 0.8187578320503235, "learning_rate": 0.0002, "epoch": 4.078892152748637, "step": 4860}, {"loss": 1.368, "grad_norm": 0.7888399362564087, "learning_rate": 0.0002, "epoch": 4.087284934955938, "step": 4870}, {"loss": 1.4115, "grad_norm": 0.7149351239204407, "learning_rate": 0.0002, "epoch": 4.095677717163239, "step": 4880}, {"loss": 1.341, "grad_norm": 0.9067983031272888, "learning_rate": 0.0002, "epoch": 4.1040704993705415, "step": 4890}, {"loss": 1.4084, "grad_norm": 0.771186351776123, "learning_rate": 0.0002, "epoch": 4.112463281577843, "step": 4900}, {"loss": 1.2722, "grad_norm": 0.7756485342979431, "learning_rate": 0.0002, "epoch": 4.120856063785145, "step": 4910}, {"loss": 1.4138, "grad_norm": 0.7149116396903992, "learning_rate": 0.0002, "epoch": 4.129248845992446, "step": 4920}, {"loss": 1.3102, "grad_norm": 0.700442910194397, "learning_rate": 0.0002, "epoch": 4.137641628199749, "step": 4930}, {"loss": 1.3628, "grad_norm": 0.8439189195632935, "learning_rate": 0.0002, "epoch": 4.14603441040705, "step": 4940}, {"loss": 1.3511, "grad_norm": 0.6570779085159302, "learning_rate": 0.0002, "epoch": 4.154427192614351, "step": 4950}, {"loss": 1.3955, "grad_norm": 0.886482298374176, "learning_rate": 0.0002, "epoch": 4.1628199748216534, "step": 4960}, {"loss": 1.4083, "grad_norm": 0.7220938801765442, "learning_rate": 0.0002, "epoch": 4.171212757028955, "step": 4970}, {"loss": 1.3611, "grad_norm": 0.7185905575752258, "learning_rate": 0.0002, "epoch": 4.179605539236257, "step": 4980}, {"loss": 1.3623, "grad_norm": 0.7566333413124084, "learning_rate": 0.0002, "epoch": 4.187998321443558, "step": 4990}, {"loss": 1.2771, "grad_norm": 0.6960445642471313, "learning_rate": 0.0002, "epoch": 4.1963911036508605, "step": 5000}, {"loss": 1.3565, "grad_norm": 0.7727336883544922, "learning_rate": 0.0002, "epoch": 4.204783885858162, "step": 5010}, {"loss": 1.4156, "grad_norm": 0.8038365244865417, "learning_rate": 0.0002, "epoch": 4.213176668065464, "step": 5020}, {"loss": 1.3849, "grad_norm": 0.7587628364562988, "learning_rate": 0.0002, "epoch": 4.221569450272765, "step": 5030}, {"loss": 1.4047, "grad_norm": 0.928032398223877, "learning_rate": 0.0002, "epoch": 4.229962232480067, "step": 5040}, {"loss": 1.3768, "grad_norm": 0.7168642282485962, "learning_rate": 0.0002, "epoch": 4.238355014687369, "step": 5050}, {"loss": 1.3767, "grad_norm": 0.7981422543525696, "learning_rate": 0.0002, "epoch": 4.24674779689467, "step": 5060}, {"loss": 1.406, "grad_norm": 0.6951150894165039, "learning_rate": 0.0002, "epoch": 4.2551405791019725, "step": 5070}, {"loss": 1.3776, "grad_norm": 0.7337371706962585, "learning_rate": 0.0002, "epoch": 4.263533361309274, "step": 5080}, {"loss": 1.3425, "grad_norm": 0.8367464542388916, "learning_rate": 0.0002, "epoch": 4.271926143516576, "step": 5090}, {"loss": 1.3823, "grad_norm": 0.6744083166122437, "learning_rate": 0.0002, "epoch": 4.280318925723877, "step": 5100}, {"loss": 1.4183, "grad_norm": 0.9072301387786865, "learning_rate": 0.0002, "epoch": 4.28871170793118, "step": 5110}, {"loss": 1.4219, "grad_norm": 0.7703930735588074, "learning_rate": 0.0002, "epoch": 4.297104490138481, "step": 5120}, {"loss": 1.3658, "grad_norm": 0.6734083294868469, "learning_rate": 0.0002, "epoch": 4.305497272345782, "step": 5130}, {"loss": 1.441, "grad_norm": 0.7835540175437927, "learning_rate": 0.0002, "epoch": 4.3138900545530845, "step": 5140}, {"loss": 1.384, "grad_norm": 1.0822200775146484, "learning_rate": 0.0002, "epoch": 4.322282836760386, "step": 5150}, {"loss": 1.4167, "grad_norm": 0.8432536721229553, "learning_rate": 0.0002, "epoch": 4.330675618967688, "step": 5160}, {"loss": 1.3796, "grad_norm": 0.6739283800125122, "learning_rate": 0.0002, "epoch": 4.339068401174989, "step": 5170}, {"loss": 1.3651, "grad_norm": 0.7395278811454773, "learning_rate": 0.0002, "epoch": 4.347461183382292, "step": 5180}, {"loss": 1.3258, "grad_norm": 0.7638891339302063, "learning_rate": 0.0002, "epoch": 4.355853965589593, "step": 5190}, {"loss": 1.34, "grad_norm": 1.1222662925720215, "learning_rate": 0.0002, "epoch": 4.364246747796894, "step": 5200}, {"loss": 1.3757, "grad_norm": 0.9102525115013123, "learning_rate": 0.0002, "epoch": 4.3726395300041965, "step": 5210}, {"loss": 1.413, "grad_norm": 0.7181593775749207, "learning_rate": 0.0002, "epoch": 4.381032312211498, "step": 5220}, {"loss": 1.3808, "grad_norm": 0.7813979387283325, "learning_rate": 0.0002, "epoch": 4.3894250944188, "step": 5230}, {"loss": 1.423, "grad_norm": 0.8906185626983643, "learning_rate": 0.0002, "epoch": 4.397817876626101, "step": 5240}, {"loss": 1.3901, "grad_norm": 0.7456443309783936, "learning_rate": 0.0002, "epoch": 4.406210658833404, "step": 5250}, {"loss": 1.3292, "grad_norm": 0.8752070069313049, "learning_rate": 0.0002, "epoch": 4.414603441040705, "step": 5260}, {"loss": 1.3351, "grad_norm": 0.9560954570770264, "learning_rate": 0.0002, "epoch": 4.422996223248007, "step": 5270}, {"loss": 1.3708, "grad_norm": 0.7227762341499329, "learning_rate": 0.0002, "epoch": 4.4313890054553084, "step": 5280}, {"loss": 1.4281, "grad_norm": 0.8141599893569946, "learning_rate": 0.0002, "epoch": 4.43978178766261, "step": 5290}, {"loss": 1.381, "grad_norm": 0.928382158279419, "learning_rate": 0.0002, "epoch": 4.448174569869912, "step": 5300}, {"loss": 1.3586, "grad_norm": 0.7719997763633728, "learning_rate": 0.0002, "epoch": 4.456567352077213, "step": 5310}, {"loss": 1.3652, "grad_norm": 0.8081879615783691, "learning_rate": 0.0002, "epoch": 4.4649601342845155, "step": 5320}, {"loss": 1.4121, "grad_norm": 0.7903412580490112, "learning_rate": 0.0002, "epoch": 4.473352916491817, "step": 5330}, {"loss": 1.4453, "grad_norm": 0.7751287221908569, "learning_rate": 0.0002, "epoch": 4.481745698699119, "step": 5340}, {"loss": 1.392, "grad_norm": 0.8287544250488281, "learning_rate": 0.0002, "epoch": 4.49013848090642, "step": 5350}, {"loss": 1.3841, "grad_norm": 0.7431012392044067, "learning_rate": 0.0002, "epoch": 4.498531263113723, "step": 5360}, {"loss": 1.3843, "grad_norm": 0.8648661971092224, "learning_rate": 0.0002, "epoch": 4.506924045321024, "step": 5370}, {"loss": 1.3742, "grad_norm": 0.9314997792243958, "learning_rate": 0.0002, "epoch": 4.515316827528325, "step": 5380}, {"loss": 1.354, "grad_norm": 0.7530864477157593, "learning_rate": 0.0002, "epoch": 4.5237096097356275, "step": 5390}, {"loss": 1.4159, "grad_norm": 0.8739821910858154, "learning_rate": 0.0002, "epoch": 4.532102391942929, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.8090344667434692, "learning_rate": 0.0002, "epoch": 4.540495174150231, "step": 5410}, {"loss": 1.4187, "grad_norm": 0.7530879974365234, "learning_rate": 0.0002, "epoch": 4.548887956357532, "step": 5420}, {"loss": 1.47, "grad_norm": 0.8787251114845276, "learning_rate": 0.0002, "epoch": 4.557280738564835, "step": 5430}, {"loss": 1.375, "grad_norm": 0.813961923122406, "learning_rate": 0.0002, "epoch": 4.565673520772136, "step": 5440}, {"loss": 1.4475, "grad_norm": 0.7778232097625732, "learning_rate": 0.0002, "epoch": 4.574066302979437, "step": 5450}, {"loss": 1.4421, "grad_norm": 0.7323020696640015, "learning_rate": 0.0002, "epoch": 4.5824590851867395, "step": 5460}, {"loss": 1.396, "grad_norm": 0.7826765179634094, "learning_rate": 0.0002, "epoch": 4.590851867394041, "step": 5470}, {"loss": 1.4068, "grad_norm": 0.7245969772338867, "learning_rate": 0.0002, "epoch": 4.599244649601343, "step": 5480}, {"loss": 1.4276, "grad_norm": 0.7697308659553528, "learning_rate": 0.0002, "epoch": 4.607637431808644, "step": 5490}, {"loss": 1.3849, "grad_norm": 0.8053571581840515, "learning_rate": 0.0002, "epoch": 4.616030214015947, "step": 5500}, {"loss": 1.4225, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.624422996223248, "step": 5510}, {"loss": 1.3771, "grad_norm": 0.7398585677146912, "learning_rate": 0.0002, "epoch": 4.632815778430549, "step": 5520}, {"loss": 1.4216, "grad_norm": 0.7896319031715393, "learning_rate": 0.0002, "epoch": 4.6412085606378515, "step": 5530}, {"loss": 1.4199, "grad_norm": 0.8290980458259583, "learning_rate": 0.0002, "epoch": 4.649601342845153, "step": 5540}, {"loss": 1.463, "grad_norm": 0.8232647776603699, "learning_rate": 0.0002, "epoch": 4.657994125052455, "step": 5550}, {"loss": 1.3925, "grad_norm": 0.9154987335205078, "learning_rate": 0.0002, "epoch": 4.666386907259756, "step": 5560}, {"loss": 1.3674, "grad_norm": 0.8400886654853821, "learning_rate": 0.0002, "epoch": 4.674779689467059, "step": 5570}, {"loss": 1.379, "grad_norm": 0.7312718629837036, "learning_rate": 0.0002, "epoch": 4.68317247167436, "step": 5580}, {"loss": 1.3925, "grad_norm": 0.8043803572654724, "learning_rate": 0.0002, "epoch": 4.691565253881662, "step": 5590}, {"loss": 1.3952, "grad_norm": 0.7966225147247314, "learning_rate": 0.0002, "epoch": 4.6999580360889635, "step": 5600}, {"loss": 1.3429, "grad_norm": 0.881574809551239, "learning_rate": 0.0002, "epoch": 4.708350818296266, "step": 5610}, {"loss": 1.4444, "grad_norm": 0.7252084016799927, "learning_rate": 0.0002, "epoch": 4.716743600503567, "step": 5620}, {"loss": 1.3566, "grad_norm": 0.7726518511772156, "learning_rate": 0.0002, "epoch": 4.725136382710868, "step": 5630}, {"loss": 1.3954, "grad_norm": 0.7306379079818726, "learning_rate": 0.0002, "epoch": 4.7335291649181706, "step": 5640}, {"loss": 1.4385, "grad_norm": 0.8029969334602356, "learning_rate": 0.0002, "epoch": 4.741921947125472, "step": 5650}, {"loss": 1.3966, "grad_norm": 0.9103893637657166, "learning_rate": 0.0002, "epoch": 4.750314729332774, "step": 5660}, {"loss": 1.4026, "grad_norm": 0.8783416748046875, "learning_rate": 0.0002, "epoch": 4.758707511540075, "step": 5670}, {"loss": 1.3427, "grad_norm": 0.6807119846343994, "learning_rate": 0.0002, "epoch": 4.767100293747378, "step": 5680}, {"loss": 1.4148, "grad_norm": 0.7103772759437561, "learning_rate": 0.0002, "epoch": 4.775493075954679, "step": 5690}, {"loss": 1.4079, "grad_norm": 0.8472093343734741, "learning_rate": 0.0002, "epoch": 4.78388585816198, "step": 5700}, {"loss": 1.3937, "grad_norm": 0.851847231388092, "learning_rate": 0.0002, "epoch": 4.7922786403692825, "step": 5710}, {"loss": 1.3965, "grad_norm": 0.9084636569023132, "learning_rate": 0.0002, "epoch": 4.800671422576584, "step": 5720}, {"loss": 1.4358, "grad_norm": 0.7628585696220398, "learning_rate": 0.0002, "epoch": 4.809064204783886, "step": 5730}, {"loss": 1.3746, "grad_norm": 0.775580883026123, "learning_rate": 0.0002, "epoch": 4.817456986991187, "step": 5740}, {"loss": 1.4573, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 4.82584976919849, "step": 5750}, {"loss": 1.3991, "grad_norm": 0.7021728754043579, "learning_rate": 0.0002, "epoch": 4.834242551405791, "step": 5760}, {"loss": 1.4012, "grad_norm": 0.7810541391372681, "learning_rate": 0.0002, "epoch": 4.842635333613092, "step": 5770}, {"loss": 1.396, "grad_norm": 0.7290041446685791, "learning_rate": 0.0002, "epoch": 4.8510281158203945, "step": 5780}, {"loss": 1.4769, "grad_norm": 0.9059709906578064, "learning_rate": 0.0002, "epoch": 4.859420898027696, "step": 5790}, {"loss": 1.4091, "grad_norm": 0.8338062167167664, "learning_rate": 0.0002, "epoch": 4.867813680234998, "step": 5800}, {"loss": 1.395, "grad_norm": 0.830926775932312, "learning_rate": 0.0002, "epoch": 4.876206462442299, "step": 5810}, {"loss": 1.4261, "grad_norm": 0.7818633317947388, "learning_rate": 0.0002, "epoch": 4.884599244649602, "step": 5820}, {"loss": 1.4252, "grad_norm": 0.8143376708030701, "learning_rate": 0.0002, "epoch": 4.892992026856903, "step": 5830}, {"loss": 1.3583, "grad_norm": 0.7754496335983276, "learning_rate": 0.0002, "epoch": 4.901384809064205, "step": 5840}, {"loss": 1.4036, "grad_norm": 0.7154468297958374, "learning_rate": 0.0002, "epoch": 4.9097775912715065, "step": 5850}, {"loss": 1.3909, "grad_norm": 0.6829783916473389, "learning_rate": 0.0002, "epoch": 4.918170373478809, "step": 5860}, {"loss": 1.3854, "grad_norm": 0.784919261932373, "learning_rate": 0.0002, "epoch": 4.92656315568611, "step": 5870}, {"loss": 1.4277, "grad_norm": 0.8168354034423828, "learning_rate": 0.0002, "epoch": 4.934955937893411, "step": 5880}, {"loss": 1.3694, "grad_norm": 0.7356618642807007, "learning_rate": 0.0002, "epoch": 4.943348720100714, "step": 5890}, {"loss": 1.4827, "grad_norm": 0.7399224042892456, "learning_rate": 0.0002, "epoch": 4.951741502308015, "step": 5900}, {"loss": 1.3643, "grad_norm": 0.7430436015129089, "learning_rate": 0.0002, "epoch": 4.960134284515317, "step": 5910}, {"loss": 1.3836, "grad_norm": 0.7587705850601196, "learning_rate": 0.0002, "epoch": 4.9685270667226185, "step": 5920}, {"loss": 1.4162, "grad_norm": 0.9103638529777527, "learning_rate": 0.0002, "epoch": 4.976919848929921, "step": 5930}, {"loss": 1.4688, "grad_norm": 0.7357394695281982, "learning_rate": 0.0002, "epoch": 4.985312631137222, "step": 5940}, {"loss": 1.3988, "grad_norm": 0.7371547222137451, "learning_rate": 0.0002, "epoch": 4.993705413344523, "step": 5950}, {"eval_loss": 1.9367210865020752, "eval_runtime": 37.9833, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.9995803608896345, "step": 5957}, {"loss": 1.3876, "grad_norm": 0.7783351540565491, "learning_rate": 0.0002, "epoch": 5.0020981955518256, "step": 5960}, {"loss": 1.2387, "grad_norm": 0.9268898367881775, "learning_rate": 0.0002, "epoch": 5.010490977759127, "step": 5970}, {"loss": 1.2621, "grad_norm": 0.9562761783599854, "learning_rate": 0.0002, "epoch": 5.018883759966429, "step": 5980}, {"loss": 1.205, "grad_norm": 0.9391738176345825, "learning_rate": 0.0002, "epoch": 5.02727654217373, "step": 5990}, {"loss": 1.2112, "grad_norm": 0.850326418876648, "learning_rate": 0.0002, "epoch": 5.035669324381033, "step": 6000}, {"loss": 1.2285, "grad_norm": 0.8442679643630981, "learning_rate": 0.0002, "epoch": 5.044062106588334, "step": 6010}, {"loss": 1.1677, "grad_norm": 1.2147290706634521, "learning_rate": 0.0002, "epoch": 5.052454888795635, "step": 6020}, {"loss": 1.1836, "grad_norm": 0.9732922315597534, "learning_rate": 0.0002, "epoch": 5.0608476710029375, "step": 6030}, {"loss": 1.215, "grad_norm": 0.9354516267776489, "learning_rate": 0.0002, "epoch": 5.069240453210239, "step": 6040}, {"loss": 1.1918, "grad_norm": 0.9681560397148132, "learning_rate": 0.0002, "epoch": 5.077633235417541, "step": 6050}, {"loss": 1.2146, "grad_norm": 0.9500439763069153, "learning_rate": 0.0002, "epoch": 5.086026017624842, "step": 6060}, {"loss": 1.1475, "grad_norm": 0.8693879246711731, "learning_rate": 0.0002, "epoch": 5.094418799832145, "step": 6070}, {"loss": 1.2181, "grad_norm": 1.1066458225250244, "learning_rate": 0.0002, "epoch": 5.102811582039446, "step": 6080}, {"loss": 1.2135, "grad_norm": 0.9530285000801086, "learning_rate": 0.0002, "epoch": 5.111204364246748, "step": 6090}, {"loss": 1.2388, "grad_norm": 0.9323630928993225, "learning_rate": 0.0002, "epoch": 5.1195971464540495, "step": 6100}, {"loss": 1.2434, "grad_norm": 0.9040294885635376, "learning_rate": 0.0002, "epoch": 5.127989928661351, "step": 6110}, {"loss": 1.2502, "grad_norm": 0.9981122612953186, "learning_rate": 0.0002, "epoch": 5.136382710868653, "step": 6120}, {"loss": 1.2648, "grad_norm": 0.9070921540260315, "learning_rate": 0.0002, "epoch": 5.144775493075954, "step": 6130}, {"loss": 1.2802, "grad_norm": 1.043802261352539, "learning_rate": 0.0002, "epoch": 5.153168275283257, "step": 6140}, {"loss": 1.1865, "grad_norm": 1.0889761447906494, "learning_rate": 0.0002, "epoch": 5.161561057490558, "step": 6150}, {"loss": 1.2498, "grad_norm": 0.9908999800682068, "learning_rate": 0.0002, "epoch": 5.16995383969786, "step": 6160}, {"loss": 1.2981, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.1783466219051615, "step": 6170}, {"loss": 1.2236, "grad_norm": 0.9536478519439697, "learning_rate": 0.0002, "epoch": 5.186739404112464, "step": 6180}, {"loss": 1.1889, "grad_norm": 0.8672952055931091, "learning_rate": 0.0002, "epoch": 5.195132186319765, "step": 6190}, {"loss": 1.2142, "grad_norm": 1.0116329193115234, "learning_rate": 0.0002, "epoch": 5.203524968527066, "step": 6200}, {"loss": 1.1813, "grad_norm": 0.9327153563499451, "learning_rate": 0.0002, "epoch": 5.211917750734369, "step": 6210}, {"loss": 1.2372, "grad_norm": 0.85637366771698, "learning_rate": 0.0002, "epoch": 5.22031053294167, "step": 6220}, {"loss": 1.2949, "grad_norm": 1.0490736961364746, "learning_rate": 0.0002, "epoch": 5.228703315148972, "step": 6230}, {"loss": 1.1604, "grad_norm": 0.8849565982818604, "learning_rate": 0.0002, "epoch": 5.2370960973562735, "step": 6240}, {"loss": 1.2257, "grad_norm": 0.8852671980857849, "learning_rate": 0.0002, "epoch": 5.245488879563576, "step": 6250}, {"loss": 1.275, "grad_norm": 0.9146860241889954, "learning_rate": 0.0002, "epoch": 5.253881661770877, "step": 6260}, {"loss": 1.2543, "grad_norm": 1.0188325643539429, "learning_rate": 0.0002, "epoch": 5.262274443978178, "step": 6270}, {"loss": 1.1703, "grad_norm": 1.0053156614303589, "learning_rate": 0.0002, "epoch": 5.270667226185481, "step": 6280}, {"loss": 1.2594, "grad_norm": 0.9962273836135864, "learning_rate": 0.0002, "epoch": 5.279060008392782, "step": 6290}, {"loss": 1.2487, "grad_norm": 1.000300645828247, "learning_rate": 0.0002, "epoch": 5.287452790600084, "step": 6300}, {"loss": 1.3214, "grad_norm": 0.9821932911872864, "learning_rate": 0.0002, "epoch": 5.295845572807385, "step": 6310}, {"loss": 1.2964, "grad_norm": 1.0103896856307983, "learning_rate": 0.0002, "epoch": 5.304238355014688, "step": 6320}, {"loss": 1.2497, "grad_norm": 0.9323601722717285, "learning_rate": 0.0002, "epoch": 5.312631137221989, "step": 6330}, {"loss": 1.3165, "grad_norm": 1.0668879747390747, "learning_rate": 0.0002, "epoch": 5.321023919429291, "step": 6340}, {"loss": 1.2411, "grad_norm": 0.9666323065757751, "learning_rate": 0.0002, "epoch": 5.3294167016365925, "step": 6350}, {"loss": 1.2129, "grad_norm": 0.9439574480056763, "learning_rate": 0.0002, "epoch": 5.337809483843894, "step": 6360}, {"loss": 1.2355, "grad_norm": 1.0229361057281494, "learning_rate": 0.0002, "epoch": 5.346202266051196, "step": 6370}, {"loss": 1.2021, "grad_norm": 0.8522404432296753, "learning_rate": 0.0002, "epoch": 5.354595048258497, "step": 6380}, {"loss": 1.32, "grad_norm": 1.3732287883758545, "learning_rate": 0.0002, "epoch": 5.3629878304658, "step": 6390}, {"loss": 1.1987, "grad_norm": 0.8201091885566711, "learning_rate": 0.0002, "epoch": 5.371380612673101, "step": 6400}, {"loss": 1.2867, "grad_norm": 0.8874436616897583, "learning_rate": 0.0002, "epoch": 5.379773394880403, "step": 6410}, {"loss": 1.2686, "grad_norm": 1.0118640661239624, "learning_rate": 0.0002, "epoch": 5.3881661770877045, "step": 6420}, {"loss": 1.2952, "grad_norm": 1.0468370914459229, "learning_rate": 0.0002, "epoch": 5.396558959295007, "step": 6430}, {"loss": 1.2057, "grad_norm": 0.941806972026825, "learning_rate": 0.0002, "epoch": 5.404951741502308, "step": 6440}, {"loss": 1.3289, "grad_norm": 0.9860424399375916, "learning_rate": 0.0002, "epoch": 5.413344523709609, "step": 6450}, {"loss": 1.2887, "grad_norm": 1.009628176689148, "learning_rate": 0.0002, "epoch": 5.421737305916912, "step": 6460}, {"loss": 1.2544, "grad_norm": 0.9842159748077393, "learning_rate": 0.0002, "epoch": 5.430130088124213, "step": 6470}, {"loss": 1.2277, "grad_norm": 0.9935571551322937, "learning_rate": 0.0002, "epoch": 5.438522870331515, "step": 6480}, {"loss": 1.2392, "grad_norm": 0.8872362971305847, "learning_rate": 0.0002, "epoch": 5.4469156525388165, "step": 6490}, {"loss": 1.2166, "grad_norm": 0.9530836939811707, "learning_rate": 0.0002, "epoch": 5.455308434746119, "step": 6500}, {"loss": 1.2138, "grad_norm": 0.8111279010772705, "learning_rate": 0.0002, "epoch": 5.46370121695342, "step": 6510}, {"loss": 1.2375, "grad_norm": 1.0474516153335571, "learning_rate": 0.0002, "epoch": 5.472093999160721, "step": 6520}, {"loss": 1.2752, "grad_norm": 1.0228482484817505, "learning_rate": 0.0002, "epoch": 5.480486781368024, "step": 6530}, {"loss": 1.2739, "grad_norm": 1.0299347639083862, "learning_rate": 0.0002, "epoch": 5.488879563575325, "step": 6540}, {"loss": 1.3163, "grad_norm": 0.9105098247528076, "learning_rate": 0.0002, "epoch": 5.497272345782627, "step": 6550}, {"loss": 1.2718, "grad_norm": 1.2459523677825928, "learning_rate": 0.0002, "epoch": 5.5056651279899285, "step": 6560}, {"loss": 1.2697, "grad_norm": 1.0630481243133545, "learning_rate": 0.0002, "epoch": 5.514057910197231, "step": 6570}, {"loss": 1.3003, "grad_norm": 0.8310980796813965, "learning_rate": 0.0002, "epoch": 5.522450692404532, "step": 6580}, {"loss": 1.1855, "grad_norm": 1.102723479270935, "learning_rate": 0.0002, "epoch": 5.530843474611833, "step": 6590}, {"loss": 1.2889, "grad_norm": 0.9586807489395142, "learning_rate": 0.0002, "epoch": 5.539236256819136, "step": 6600}, {"loss": 1.2899, "grad_norm": 0.976191520690918, "learning_rate": 0.0002, "epoch": 5.547629039026437, "step": 6610}, {"loss": 1.2319, "grad_norm": 0.9943762421607971, "learning_rate": 0.0002, "epoch": 5.556021821233739, "step": 6620}, {"loss": 1.3103, "grad_norm": 0.8788089156150818, "learning_rate": 0.0002, "epoch": 5.56441460344104, "step": 6630}, {"loss": 1.1982, "grad_norm": 0.9866173267364502, "learning_rate": 0.0002, "epoch": 5.572807385648343, "step": 6640}, {"loss": 1.2686, "grad_norm": 1.0791642665863037, "learning_rate": 0.0002, "epoch": 5.581200167855644, "step": 6650}, {"loss": 1.2806, "grad_norm": 0.836482584476471, "learning_rate": 0.0002, "epoch": 5.589592950062946, "step": 6660}, {"loss": 1.3114, "grad_norm": 0.9841130971908569, "learning_rate": 0.0002, "epoch": 5.5979857322702475, "step": 6670}, {"loss": 1.2323, "grad_norm": 0.9678813815116882, "learning_rate": 0.0002, "epoch": 5.60637851447755, "step": 6680}, {"loss": 1.1969, "grad_norm": 0.9033233523368835, "learning_rate": 0.0002, "epoch": 5.614771296684851, "step": 6690}, {"loss": 1.2565, "grad_norm": 0.8691515922546387, "learning_rate": 0.0002, "epoch": 5.623164078892152, "step": 6700}, {"loss": 1.2678, "grad_norm": 0.8971360921859741, "learning_rate": 0.0002, "epoch": 5.631556861099455, "step": 6710}, {"loss": 1.2266, "grad_norm": 0.9377756118774414, "learning_rate": 0.0002, "epoch": 5.639949643306756, "step": 6720}, {"loss": 1.28, "grad_norm": 0.908762514591217, "learning_rate": 0.0002, "epoch": 5.648342425514058, "step": 6730}, {"loss": 1.2499, "grad_norm": 1.0503337383270264, "learning_rate": 0.0002, "epoch": 5.6567352077213595, "step": 6740}, {"loss": 1.3604, "grad_norm": 1.030267357826233, "learning_rate": 0.0002, "epoch": 5.665127989928662, "step": 6750}, {"loss": 1.2223, "grad_norm": 0.9150485992431641, "learning_rate": 0.0002, "epoch": 5.673520772135963, "step": 6760}, {"loss": 1.2651, "grad_norm": 1.0300343036651611, "learning_rate": 0.0002, "epoch": 5.681913554343264, "step": 6770}, {"loss": 1.2506, "grad_norm": 1.1242924928665161, "learning_rate": 0.0002, "epoch": 5.690306336550567, "step": 6780}, {"loss": 1.3318, "grad_norm": 0.9489498138427734, "learning_rate": 0.0002, "epoch": 5.698699118757868, "step": 6790}, {"loss": 1.2578, "grad_norm": 0.8829707503318787, "learning_rate": 0.0002, "epoch": 5.70709190096517, "step": 6800}, {"loss": 1.2765, "grad_norm": 1.01392662525177, "learning_rate": 0.0002, "epoch": 5.7154846831724715, "step": 6810}, {"loss": 1.3029, "grad_norm": 0.9234510064125061, "learning_rate": 0.0002, "epoch": 5.723877465379774, "step": 6820}, {"loss": 1.2891, "grad_norm": 0.9439187049865723, "learning_rate": 0.0002, "epoch": 5.732270247587075, "step": 6830}, {"loss": 1.2627, "grad_norm": 0.8833441734313965, "learning_rate": 0.0002, "epoch": 5.740663029794376, "step": 6840}, {"loss": 1.3195, "grad_norm": 0.9394439458847046, "learning_rate": 0.0002, "epoch": 5.749055812001679, "step": 6850}, {"loss": 1.3108, "grad_norm": 0.9980010390281677, "learning_rate": 0.0002, "epoch": 5.75744859420898, "step": 6860}, {"loss": 1.2958, "grad_norm": 0.9612377882003784, "learning_rate": 0.0002, "epoch": 5.765841376416282, "step": 6870}, {"loss": 1.2173, "grad_norm": 1.0817323923110962, "learning_rate": 0.0002, "epoch": 5.7742341586235835, "step": 6880}, {"loss": 1.2485, "grad_norm": 0.8445103168487549, "learning_rate": 0.0002, "epoch": 5.782626940830886, "step": 6890}, {"loss": 1.2573, "grad_norm": 0.8535459041595459, "learning_rate": 0.0002, "epoch": 5.791019723038187, "step": 6900}, {"loss": 1.2729, "grad_norm": 0.9131284356117249, "learning_rate": 0.0002, "epoch": 5.799412505245489, "step": 6910}, {"loss": 1.1934, "grad_norm": 0.8627726435661316, "learning_rate": 0.0002, "epoch": 5.807805287452791, "step": 6920}, {"loss": 1.3226, "grad_norm": 0.8599951863288879, "learning_rate": 0.0002, "epoch": 5.816198069660093, "step": 6930}, {"loss": 1.3078, "grad_norm": 1.0746861696243286, "learning_rate": 0.0002, "epoch": 5.824590851867394, "step": 6940}, {"loss": 1.2653, "grad_norm": 1.0220543146133423, "learning_rate": 0.0002, "epoch": 5.8329836340746954, "step": 6950}, {"loss": 1.3168, "grad_norm": 0.8891388177871704, "learning_rate": 0.0002, "epoch": 5.841376416281998, "step": 6960}, {"loss": 1.2845, "grad_norm": 1.1404683589935303, "learning_rate": 0.0002, "epoch": 5.849769198489299, "step": 6970}, {"loss": 1.2361, "grad_norm": 0.9665380120277405, "learning_rate": 0.0002, "epoch": 5.858161980696601, "step": 6980}, {"loss": 1.2622, "grad_norm": 0.9837968945503235, "learning_rate": 0.0002, "epoch": 5.8665547629039025, "step": 6990}, {"loss": 1.2973, "grad_norm": 1.0278598070144653, "learning_rate": 0.0002, "epoch": 5.874947545111205, "step": 7000}, {"loss": 1.2334, "grad_norm": 0.9990253448486328, "learning_rate": 0.0002, "epoch": 5.883340327318506, "step": 7010}, {"loss": 1.3508, "grad_norm": 0.9705647230148315, "learning_rate": 0.0002, "epoch": 5.891733109525807, "step": 7020}, {"loss": 1.335, "grad_norm": 0.9672252535820007, "learning_rate": 0.0002, "epoch": 5.90012589173311, "step": 7030}, {"loss": 1.2944, "grad_norm": 0.9467034339904785, "learning_rate": 0.0002, "epoch": 5.908518673940411, "step": 7040}, {"loss": 1.2704, "grad_norm": 0.9506469964981079, "learning_rate": 0.0002, "epoch": 5.916911456147713, "step": 7050}, {"loss": 1.2745, "grad_norm": 0.8936163783073425, "learning_rate": 0.0002, "epoch": 5.9253042383550145, "step": 7060}, {"loss": 1.2702, "grad_norm": 0.956101655960083, "learning_rate": 0.0002, "epoch": 5.933697020562317, "step": 7070}, {"loss": 1.2532, "grad_norm": 0.893535852432251, "learning_rate": 0.0002, "epoch": 5.942089802769618, "step": 7080}, {"loss": 1.342, "grad_norm": 1.0313799381256104, "learning_rate": 0.0002, "epoch": 5.950482584976919, "step": 7090}, {"loss": 1.3398, "grad_norm": 0.8567915558815002, "learning_rate": 0.0002, "epoch": 5.958875367184222, "step": 7100}, {"loss": 1.3127, "grad_norm": 0.9683501720428467, "learning_rate": 0.0002, "epoch": 5.967268149391523, "step": 7110}, {"loss": 1.2522, "grad_norm": 0.9401984214782715, "learning_rate": 0.0002, "epoch": 5.975660931598825, "step": 7120}, {"loss": 1.3211, "grad_norm": 1.0316764116287231, "learning_rate": 0.0002, "epoch": 5.9840537138061265, "step": 7130}, {"loss": 1.2445, "grad_norm": 0.9335392713546753, "learning_rate": 0.0002, "epoch": 5.992446496013429, "step": 7140}]} +{"epoch": 6.9995803608896345, "step": 8340, "epoch_duration": 1294.9648473262787, "total_accumulated_duration": 9042.096393823624, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}, {"eval_loss": 1.824695348739624, "eval_runtime": 37.947, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.999580360889635, "step": 3574}, {"loss": 1.5633, "grad_norm": 0.4380604326725006, "learning_rate": 0.0002, "epoch": 3.004616030214016, "step": 3580}, {"loss": 1.4474, "grad_norm": 0.5375564098358154, "learning_rate": 0.0002, "epoch": 3.0130088124213175, "step": 3590}, {"loss": 1.5738, "grad_norm": 0.50722736120224, "learning_rate": 0.0002, "epoch": 3.0214015946286192, "step": 3600}, {"loss": 1.5191, "grad_norm": 0.5398766994476318, "learning_rate": 0.0002, "epoch": 3.029794376835921, "step": 3610}, {"loss": 1.4401, "grad_norm": 0.520709753036499, "learning_rate": 0.0002, "epoch": 3.038187159043223, "step": 3620}, {"loss": 1.5704, "grad_norm": 0.5429664850234985, "learning_rate": 0.0002, "epoch": 3.0465799412505246, "step": 3630}, {"loss": 1.5516, "grad_norm": 0.5634943842887878, "learning_rate": 0.0002, "epoch": 3.0549727234578263, "step": 3640}, {"loss": 1.5349, "grad_norm": 0.5042277574539185, "learning_rate": 0.0002, "epoch": 3.063365505665128, "step": 3650}, {"loss": 1.4708, "grad_norm": 0.5778711438179016, "learning_rate": 0.0002, "epoch": 3.07175828787243, "step": 3660}, {"loss": 1.5196, "grad_norm": 0.5504926443099976, "learning_rate": 0.0002, "epoch": 3.080151070079731, "step": 3670}, {"loss": 1.473, "grad_norm": 0.5199463963508606, "learning_rate": 0.0002, "epoch": 3.088543852287033, "step": 3680}, {"loss": 1.5064, "grad_norm": 0.552334189414978, "learning_rate": 0.0002, "epoch": 3.0969366344943348, "step": 3690}, {"loss": 1.4638, "grad_norm": 0.5650873780250549, "learning_rate": 0.0002, "epoch": 3.1053294167016365, "step": 3700}, {"loss": 1.4945, "grad_norm": 0.6292349696159363, "learning_rate": 0.0002, "epoch": 3.1137221989089383, "step": 3710}, {"loss": 1.4787, "grad_norm": 0.5523604154586792, "learning_rate": 0.0002, "epoch": 3.12211498111624, "step": 3720}, {"loss": 1.4697, "grad_norm": 0.6160100698471069, "learning_rate": 0.0002, "epoch": 3.130507763323542, "step": 3730}, {"loss": 1.5589, "grad_norm": 0.6091629266738892, "learning_rate": 0.0002, "epoch": 3.1389005455308436, "step": 3740}, {"loss": 1.4659, "grad_norm": 0.5695531964302063, "learning_rate": 0.0002, "epoch": 3.1472933277381454, "step": 3750}, {"loss": 1.4605, "grad_norm": 0.569611132144928, "learning_rate": 0.0002, "epoch": 3.1556861099454467, "step": 3760}, {"loss": 1.4592, "grad_norm": 0.5761140584945679, "learning_rate": 0.0002, "epoch": 3.1640788921527485, "step": 3770}, {"loss": 1.4999, "grad_norm": 0.6855548620223999, "learning_rate": 0.0002, "epoch": 3.1724716743600503, "step": 3780}, {"loss": 1.5047, "grad_norm": 0.5815101265907288, "learning_rate": 0.0002, "epoch": 3.180864456567352, "step": 3790}, {"loss": 1.5289, "grad_norm": 0.6179960370063782, "learning_rate": 0.0002, "epoch": 3.189257238774654, "step": 3800}, {"loss": 1.4833, "grad_norm": 0.5418674349784851, "learning_rate": 0.0002, "epoch": 3.1976500209819556, "step": 3810}, {"loss": 1.4994, "grad_norm": 0.5655816197395325, "learning_rate": 0.0002, "epoch": 3.2060428031892574, "step": 3820}, {"loss": 1.5007, "grad_norm": 0.7279291152954102, "learning_rate": 0.0002, "epoch": 3.214435585396559, "step": 3830}, {"loss": 1.5672, "grad_norm": 0.490998238325119, "learning_rate": 0.0002, "epoch": 3.2228283676038605, "step": 3840}, {"loss": 1.4683, "grad_norm": 0.6065797209739685, "learning_rate": 0.0002, "epoch": 3.2312211498111623, "step": 3850}, {"loss": 1.5153, "grad_norm": 0.6024682521820068, "learning_rate": 0.0002, "epoch": 3.239613932018464, "step": 3860}, {"loss": 1.5123, "grad_norm": 0.5571125745773315, "learning_rate": 0.0002, "epoch": 3.248006714225766, "step": 3870}, {"loss": 1.4609, "grad_norm": 0.5662134289741516, "learning_rate": 0.0002, "epoch": 3.2563994964330676, "step": 3880}, {"loss": 1.5452, "grad_norm": 0.5936661958694458, "learning_rate": 0.0002, "epoch": 3.2647922786403694, "step": 3890}, {"loss": 1.5149, "grad_norm": 0.6739671230316162, "learning_rate": 0.0002, "epoch": 3.273185060847671, "step": 3900}, {"loss": 1.5101, "grad_norm": 0.5579532384872437, "learning_rate": 0.0002, "epoch": 3.281577843054973, "step": 3910}, {"loss": 1.4788, "grad_norm": 0.6595954298973083, "learning_rate": 0.0002, "epoch": 3.2899706252622742, "step": 3920}, {"loss": 1.473, "grad_norm": 0.5712262988090515, "learning_rate": 0.0002, "epoch": 3.298363407469576, "step": 3930}, {"loss": 1.5512, "grad_norm": 0.5601761341094971, "learning_rate": 0.0002, "epoch": 3.306756189676878, "step": 3940}, {"loss": 1.4904, "grad_norm": 0.5759967565536499, "learning_rate": 0.0002, "epoch": 3.3151489718841796, "step": 3950}, {"loss": 1.4885, "grad_norm": 0.6543047428131104, "learning_rate": 0.0002, "epoch": 3.3235417540914813, "step": 3960}, {"loss": 1.5063, "grad_norm": 0.6355253458023071, "learning_rate": 0.0002, "epoch": 3.331934536298783, "step": 3970}, {"loss": 1.5025, "grad_norm": 0.5671007633209229, "learning_rate": 0.0002, "epoch": 3.340327318506085, "step": 3980}, {"loss": 1.5049, "grad_norm": 0.6743636727333069, "learning_rate": 0.0002, "epoch": 3.3487201007133867, "step": 3990}, {"loss": 1.5527, "grad_norm": 0.500627338886261, "learning_rate": 0.0002, "epoch": 3.3571128829206884, "step": 4000}, {"loss": 1.4884, "grad_norm": 0.5666340589523315, "learning_rate": 0.0002, "epoch": 3.3655056651279898, "step": 4010}, {"loss": 1.5104, "grad_norm": 0.5651408433914185, "learning_rate": 0.0002, "epoch": 3.3738984473352915, "step": 4020}, {"loss": 1.4907, "grad_norm": 0.6338897943496704, "learning_rate": 0.0002, "epoch": 3.3822912295425933, "step": 4030}, {"loss": 1.553, "grad_norm": 0.5781935453414917, "learning_rate": 0.0002, "epoch": 3.390684011749895, "step": 4040}, {"loss": 1.5535, "grad_norm": 0.55543053150177, "learning_rate": 0.0002, "epoch": 3.399076793957197, "step": 4050}, {"loss": 1.4884, "grad_norm": 0.6602614521980286, "learning_rate": 0.0002, "epoch": 3.4074695761644986, "step": 4060}, {"loss": 1.471, "grad_norm": 0.5514156222343445, "learning_rate": 0.0002, "epoch": 3.4158623583718004, "step": 4070}, {"loss": 1.4634, "grad_norm": 0.5760560035705566, "learning_rate": 0.0002, "epoch": 3.4242551405791017, "step": 4080}, {"loss": 1.4662, "grad_norm": 0.657503604888916, "learning_rate": 0.0002, "epoch": 3.4326479227864035, "step": 4090}, {"loss": 1.5041, "grad_norm": 0.5746736526489258, "learning_rate": 0.0002, "epoch": 3.4410407049937053, "step": 4100}, {"loss": 1.4387, "grad_norm": 0.5988999009132385, "learning_rate": 0.0002, "epoch": 3.449433487201007, "step": 4110}, {"loss": 1.5475, "grad_norm": 0.7294586300849915, "learning_rate": 0.0002, "epoch": 3.457826269408309, "step": 4120}, {"loss": 1.4878, "grad_norm": 0.6391161680221558, "learning_rate": 0.0002, "epoch": 3.4662190516156106, "step": 4130}, {"loss": 1.5366, "grad_norm": 0.6416470408439636, "learning_rate": 0.0002, "epoch": 3.4746118338229124, "step": 4140}, {"loss": 1.5587, "grad_norm": 0.5710626244544983, "learning_rate": 0.0002, "epoch": 3.483004616030214, "step": 4150}, {"loss": 1.4661, "grad_norm": 0.5370054841041565, "learning_rate": 0.0002, "epoch": 3.491397398237516, "step": 4160}, {"loss": 1.5167, "grad_norm": 0.5559558272361755, "learning_rate": 0.0002, "epoch": 3.4997901804448173, "step": 4170}, {"loss": 1.4244, "grad_norm": 0.5426168441772461, "learning_rate": 0.0002, "epoch": 3.508182962652119, "step": 4180}, {"loss": 1.5241, "grad_norm": 0.5997438430786133, "learning_rate": 0.0002, "epoch": 3.516575744859421, "step": 4190}, {"loss": 1.6091, "grad_norm": 0.5399143099784851, "learning_rate": 0.0002, "epoch": 3.5249685270667226, "step": 4200}, {"loss": 1.5066, "grad_norm": 0.6341416239738464, "learning_rate": 0.0002, "epoch": 3.5333613092740244, "step": 4210}, {"loss": 1.5436, "grad_norm": 0.632238507270813, "learning_rate": 0.0002, "epoch": 3.541754091481326, "step": 4220}, {"loss": 1.5423, "grad_norm": 0.6356478333473206, "learning_rate": 0.0002, "epoch": 3.550146873688628, "step": 4230}, {"loss": 1.483, "grad_norm": 0.6379408240318298, "learning_rate": 0.0002, "epoch": 3.5585396558959292, "step": 4240}, {"loss": 1.5184, "grad_norm": 0.6265586018562317, "learning_rate": 0.0002, "epoch": 3.5669324381032315, "step": 4250}, {"loss": 1.5047, "grad_norm": 0.5378820896148682, "learning_rate": 0.0002, "epoch": 3.575325220310533, "step": 4260}, {"loss": 1.5668, "grad_norm": 0.6800801753997803, "learning_rate": 0.0002, "epoch": 3.5837180025178346, "step": 4270}, {"loss": 1.5363, "grad_norm": 0.5653113126754761, "learning_rate": 0.0002, "epoch": 3.5921107847251363, "step": 4280}, {"loss": 1.5007, "grad_norm": 0.548647940158844, "learning_rate": 0.0002, "epoch": 3.600503566932438, "step": 4290}, {"loss": 1.5034, "grad_norm": 0.5729944705963135, "learning_rate": 0.0002, "epoch": 3.60889634913974, "step": 4300}, {"loss": 1.575, "grad_norm": 0.6204999685287476, "learning_rate": 0.0002, "epoch": 3.6172891313470417, "step": 4310}, {"loss": 1.5107, "grad_norm": 0.6275812983512878, "learning_rate": 0.0002, "epoch": 3.6256819135543434, "step": 4320}, {"loss": 1.5013, "grad_norm": 0.7261835336685181, "learning_rate": 0.0002, "epoch": 3.6340746957616448, "step": 4330}, {"loss": 1.5128, "grad_norm": 0.6048004627227783, "learning_rate": 0.0002, "epoch": 3.6424674779689465, "step": 4340}, {"loss": 1.5106, "grad_norm": 0.5879671573638916, "learning_rate": 0.0002, "epoch": 3.6508602601762483, "step": 4350}, {"loss": 1.5477, "grad_norm": 0.6001018285751343, "learning_rate": 0.0002, "epoch": 3.65925304238355, "step": 4360}, {"loss": 1.5247, "grad_norm": 0.6468151211738586, "learning_rate": 0.0002, "epoch": 3.667645824590852, "step": 4370}, {"loss": 1.563, "grad_norm": 0.6342051029205322, "learning_rate": 0.0002, "epoch": 3.6760386067981536, "step": 4380}, {"loss": 1.5444, "grad_norm": 0.6078384518623352, "learning_rate": 0.0002, "epoch": 3.6844313890054554, "step": 4390}, {"loss": 1.5546, "grad_norm": 0.5555588006973267, "learning_rate": 0.0002, "epoch": 3.692824171212757, "step": 4400}, {"loss": 1.5694, "grad_norm": 0.6089665293693542, "learning_rate": 0.0002, "epoch": 3.701216953420059, "step": 4410}, {"loss": 1.5898, "grad_norm": 0.6225191950798035, "learning_rate": 0.0002, "epoch": 3.7096097356273603, "step": 4420}, {"loss": 1.5153, "grad_norm": 0.5642715692520142, "learning_rate": 0.0002, "epoch": 3.718002517834662, "step": 4430}, {"loss": 1.5057, "grad_norm": 0.5703449845314026, "learning_rate": 0.0002, "epoch": 3.726395300041964, "step": 4440}, {"loss": 1.5451, "grad_norm": 0.6029745936393738, "learning_rate": 0.0002, "epoch": 3.7347880822492656, "step": 4450}, {"loss": 1.5044, "grad_norm": 0.7089189887046814, "learning_rate": 0.0002, "epoch": 3.7431808644565674, "step": 4460}, {"loss": 1.4804, "grad_norm": 0.6230936050415039, "learning_rate": 0.0002, "epoch": 3.751573646663869, "step": 4470}, {"loss": 1.567, "grad_norm": 0.5718494653701782, "learning_rate": 0.0002, "epoch": 3.759966428871171, "step": 4480}, {"loss": 1.5612, "grad_norm": 0.5404117703437805, "learning_rate": 0.0002, "epoch": 3.7683592110784723, "step": 4490}, {"loss": 1.4707, "grad_norm": 0.5816529393196106, "learning_rate": 0.0002, "epoch": 3.7767519932857745, "step": 4500}, {"loss": 1.5802, "grad_norm": 0.6314901113510132, "learning_rate": 0.0002, "epoch": 3.785144775493076, "step": 4510}, {"loss": 1.5445, "grad_norm": 0.7639698386192322, "learning_rate": 0.0002, "epoch": 3.7935375577003776, "step": 4520}, {"loss": 1.5718, "grad_norm": 0.5727366209030151, "learning_rate": 0.0002, "epoch": 3.8019303399076794, "step": 4530}, {"loss": 1.5409, "grad_norm": 0.6467128396034241, "learning_rate": 0.0002, "epoch": 3.810323122114981, "step": 4540}, {"loss": 1.5266, "grad_norm": 0.6572837233543396, "learning_rate": 0.0002, "epoch": 3.818715904322283, "step": 4550}, {"loss": 1.5718, "grad_norm": 0.5847418904304504, "learning_rate": 0.0002, "epoch": 3.8271086865295847, "step": 4560}, {"loss": 1.5303, "grad_norm": 0.48820871114730835, "learning_rate": 0.0002, "epoch": 3.8355014687368865, "step": 4570}, {"loss": 1.4911, "grad_norm": 1.2537429332733154, "learning_rate": 0.0002, "epoch": 3.843894250944188, "step": 4580}, {"loss": 1.5522, "grad_norm": 0.6026989221572876, "learning_rate": 0.0002, "epoch": 3.8522870331514896, "step": 4590}, {"loss": 1.5035, "grad_norm": 0.5541417598724365, "learning_rate": 0.0002, "epoch": 3.8606798153587913, "step": 4600}, {"loss": 1.5238, "grad_norm": 0.7668771147727966, "learning_rate": 0.0002, "epoch": 3.869072597566093, "step": 4610}, {"loss": 1.5428, "grad_norm": 0.6181227564811707, "learning_rate": 0.0002, "epoch": 3.877465379773395, "step": 4620}, {"loss": 1.5242, "grad_norm": 0.5842700004577637, "learning_rate": 0.0002, "epoch": 3.8858581619806967, "step": 4630}, {"loss": 1.5501, "grad_norm": 0.5824751257896423, "learning_rate": 0.0002, "epoch": 3.8942509441879984, "step": 4640}, {"loss": 1.4443, "grad_norm": 0.6212735772132874, "learning_rate": 0.0002, "epoch": 3.9026437263952998, "step": 4650}, {"loss": 1.4972, "grad_norm": 0.6123346090316772, "learning_rate": 0.0002, "epoch": 3.911036508602602, "step": 4660}, {"loss": 1.5531, "grad_norm": 0.518662691116333, "learning_rate": 0.0002, "epoch": 3.9194292908099033, "step": 4670}, {"loss": 1.5151, "grad_norm": 0.6963476538658142, "learning_rate": 0.0002, "epoch": 3.927822073017205, "step": 4680}, {"loss": 1.5826, "grad_norm": 0.5192152261734009, "learning_rate": 0.0002, "epoch": 3.936214855224507, "step": 4690}, {"loss": 1.5312, "grad_norm": 0.5820888876914978, "learning_rate": 0.0002, "epoch": 3.9446076374318086, "step": 4700}, {"loss": 1.527, "grad_norm": 0.6320387721061707, "learning_rate": 0.0002, "epoch": 3.9530004196391104, "step": 4710}, {"loss": 1.6006, "grad_norm": 0.6174548268318176, "learning_rate": 0.0002, "epoch": 3.961393201846412, "step": 4720}, {"loss": 1.5581, "grad_norm": 0.6691966652870178, "learning_rate": 0.0002, "epoch": 3.969785984053714, "step": 4730}, {"loss": 1.4762, "grad_norm": 0.5972068309783936, "learning_rate": 0.0002, "epoch": 3.9781787662610153, "step": 4740}, {"loss": 1.4947, "grad_norm": 0.5759536027908325, "learning_rate": 0.0002, "epoch": 3.9865715484683175, "step": 4750}, {"loss": 1.4836, "grad_norm": 0.5886756777763367, "learning_rate": 0.0002, "epoch": 3.994964330675619, "step": 4760}, {"eval_loss": 1.8749940395355225, "eval_runtime": 38.037, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 4.0, "step": 4766}, {"loss": 1.5259, "grad_norm": 0.5915011167526245, "learning_rate": 0.0002, "epoch": 4.003357112882921, "step": 4770}, {"loss": 1.4071, "grad_norm": 0.8565000891685486, "learning_rate": 0.0002, "epoch": 4.011749895090222, "step": 4780}, {"loss": 1.3211, "grad_norm": 0.7753950953483582, "learning_rate": 0.0002, "epoch": 4.020142677297524, "step": 4790}, {"loss": 1.3607, "grad_norm": 0.6837254166603088, "learning_rate": 0.0002, "epoch": 4.028535459504826, "step": 4800}, {"loss": 1.3275, "grad_norm": 0.8374526500701904, "learning_rate": 0.0002, "epoch": 4.036928241712127, "step": 4810}, {"loss": 1.3579, "grad_norm": 0.8717963099479675, "learning_rate": 0.0002, "epoch": 4.0453210239194295, "step": 4820}, {"loss": 1.3374, "grad_norm": 0.7002043724060059, "learning_rate": 0.0002, "epoch": 4.053713806126731, "step": 4830}, {"loss": 1.3882, "grad_norm": 1.0319572687149048, "learning_rate": 0.0002, "epoch": 4.062106588334033, "step": 4840}, {"loss": 1.3291, "grad_norm": 0.6746882200241089, "learning_rate": 0.0002, "epoch": 4.070499370541334, "step": 4850}, {"loss": 1.339, "grad_norm": 0.8187578320503235, "learning_rate": 0.0002, "epoch": 4.078892152748637, "step": 4860}, {"loss": 1.368, "grad_norm": 0.7888399362564087, "learning_rate": 0.0002, "epoch": 4.087284934955938, "step": 4870}, {"loss": 1.4115, "grad_norm": 0.7149351239204407, "learning_rate": 0.0002, "epoch": 4.095677717163239, "step": 4880}, {"loss": 1.341, "grad_norm": 0.9067983031272888, "learning_rate": 0.0002, "epoch": 4.1040704993705415, "step": 4890}, {"loss": 1.4084, "grad_norm": 0.771186351776123, "learning_rate": 0.0002, "epoch": 4.112463281577843, "step": 4900}, {"loss": 1.2722, "grad_norm": 0.7756485342979431, "learning_rate": 0.0002, "epoch": 4.120856063785145, "step": 4910}, {"loss": 1.4138, "grad_norm": 0.7149116396903992, "learning_rate": 0.0002, "epoch": 4.129248845992446, "step": 4920}, {"loss": 1.3102, "grad_norm": 0.700442910194397, "learning_rate": 0.0002, "epoch": 4.137641628199749, "step": 4930}, {"loss": 1.3628, "grad_norm": 0.8439189195632935, "learning_rate": 0.0002, "epoch": 4.14603441040705, "step": 4940}, {"loss": 1.3511, "grad_norm": 0.6570779085159302, "learning_rate": 0.0002, "epoch": 4.154427192614351, "step": 4950}, {"loss": 1.3955, "grad_norm": 0.886482298374176, "learning_rate": 0.0002, "epoch": 4.1628199748216534, "step": 4960}, {"loss": 1.4083, "grad_norm": 0.7220938801765442, "learning_rate": 0.0002, "epoch": 4.171212757028955, "step": 4970}, {"loss": 1.3611, "grad_norm": 0.7185905575752258, "learning_rate": 0.0002, "epoch": 4.179605539236257, "step": 4980}, {"loss": 1.3623, "grad_norm": 0.7566333413124084, "learning_rate": 0.0002, "epoch": 4.187998321443558, "step": 4990}, {"loss": 1.2771, "grad_norm": 0.6960445642471313, "learning_rate": 0.0002, "epoch": 4.1963911036508605, "step": 5000}, {"loss": 1.3565, "grad_norm": 0.7727336883544922, "learning_rate": 0.0002, "epoch": 4.204783885858162, "step": 5010}, {"loss": 1.4156, "grad_norm": 0.8038365244865417, "learning_rate": 0.0002, "epoch": 4.213176668065464, "step": 5020}, {"loss": 1.3849, "grad_norm": 0.7587628364562988, "learning_rate": 0.0002, "epoch": 4.221569450272765, "step": 5030}, {"loss": 1.4047, "grad_norm": 0.928032398223877, "learning_rate": 0.0002, "epoch": 4.229962232480067, "step": 5040}, {"loss": 1.3768, "grad_norm": 0.7168642282485962, "learning_rate": 0.0002, "epoch": 4.238355014687369, "step": 5050}, {"loss": 1.3767, "grad_norm": 0.7981422543525696, "learning_rate": 0.0002, "epoch": 4.24674779689467, "step": 5060}, {"loss": 1.406, "grad_norm": 0.6951150894165039, "learning_rate": 0.0002, "epoch": 4.2551405791019725, "step": 5070}, {"loss": 1.3776, "grad_norm": 0.7337371706962585, "learning_rate": 0.0002, "epoch": 4.263533361309274, "step": 5080}, {"loss": 1.3425, "grad_norm": 0.8367464542388916, "learning_rate": 0.0002, "epoch": 4.271926143516576, "step": 5090}, {"loss": 1.3823, "grad_norm": 0.6744083166122437, "learning_rate": 0.0002, "epoch": 4.280318925723877, "step": 5100}, {"loss": 1.4183, "grad_norm": 0.9072301387786865, "learning_rate": 0.0002, "epoch": 4.28871170793118, "step": 5110}, {"loss": 1.4219, "grad_norm": 0.7703930735588074, "learning_rate": 0.0002, "epoch": 4.297104490138481, "step": 5120}, {"loss": 1.3658, "grad_norm": 0.6734083294868469, "learning_rate": 0.0002, "epoch": 4.305497272345782, "step": 5130}, {"loss": 1.441, "grad_norm": 0.7835540175437927, "learning_rate": 0.0002, "epoch": 4.3138900545530845, "step": 5140}, {"loss": 1.384, "grad_norm": 1.0822200775146484, "learning_rate": 0.0002, "epoch": 4.322282836760386, "step": 5150}, {"loss": 1.4167, "grad_norm": 0.8432536721229553, "learning_rate": 0.0002, "epoch": 4.330675618967688, "step": 5160}, {"loss": 1.3796, "grad_norm": 0.6739283800125122, "learning_rate": 0.0002, "epoch": 4.339068401174989, "step": 5170}, {"loss": 1.3651, "grad_norm": 0.7395278811454773, "learning_rate": 0.0002, "epoch": 4.347461183382292, "step": 5180}, {"loss": 1.3258, "grad_norm": 0.7638891339302063, "learning_rate": 0.0002, "epoch": 4.355853965589593, "step": 5190}, {"loss": 1.34, "grad_norm": 1.1222662925720215, "learning_rate": 0.0002, "epoch": 4.364246747796894, "step": 5200}, {"loss": 1.3757, "grad_norm": 0.9102525115013123, "learning_rate": 0.0002, "epoch": 4.3726395300041965, "step": 5210}, {"loss": 1.413, "grad_norm": 0.7181593775749207, "learning_rate": 0.0002, "epoch": 4.381032312211498, "step": 5220}, {"loss": 1.3808, "grad_norm": 0.7813979387283325, "learning_rate": 0.0002, "epoch": 4.3894250944188, "step": 5230}, {"loss": 1.423, "grad_norm": 0.8906185626983643, "learning_rate": 0.0002, "epoch": 4.397817876626101, "step": 5240}, {"loss": 1.3901, "grad_norm": 0.7456443309783936, "learning_rate": 0.0002, "epoch": 4.406210658833404, "step": 5250}, {"loss": 1.3292, "grad_norm": 0.8752070069313049, "learning_rate": 0.0002, "epoch": 4.414603441040705, "step": 5260}, {"loss": 1.3351, "grad_norm": 0.9560954570770264, "learning_rate": 0.0002, "epoch": 4.422996223248007, "step": 5270}, {"loss": 1.3708, "grad_norm": 0.7227762341499329, "learning_rate": 0.0002, "epoch": 4.4313890054553084, "step": 5280}, {"loss": 1.4281, "grad_norm": 0.8141599893569946, "learning_rate": 0.0002, "epoch": 4.43978178766261, "step": 5290}, {"loss": 1.381, "grad_norm": 0.928382158279419, "learning_rate": 0.0002, "epoch": 4.448174569869912, "step": 5300}, {"loss": 1.3586, "grad_norm": 0.7719997763633728, "learning_rate": 0.0002, "epoch": 4.456567352077213, "step": 5310}, {"loss": 1.3652, "grad_norm": 0.8081879615783691, "learning_rate": 0.0002, "epoch": 4.4649601342845155, "step": 5320}, {"loss": 1.4121, "grad_norm": 0.7903412580490112, "learning_rate": 0.0002, "epoch": 4.473352916491817, "step": 5330}, {"loss": 1.4453, "grad_norm": 0.7751287221908569, "learning_rate": 0.0002, "epoch": 4.481745698699119, "step": 5340}, {"loss": 1.392, "grad_norm": 0.8287544250488281, "learning_rate": 0.0002, "epoch": 4.49013848090642, "step": 5350}, {"loss": 1.3841, "grad_norm": 0.7431012392044067, "learning_rate": 0.0002, "epoch": 4.498531263113723, "step": 5360}, {"loss": 1.3843, "grad_norm": 0.8648661971092224, "learning_rate": 0.0002, "epoch": 4.506924045321024, "step": 5370}, {"loss": 1.3742, "grad_norm": 0.9314997792243958, "learning_rate": 0.0002, "epoch": 4.515316827528325, "step": 5380}, {"loss": 1.354, "grad_norm": 0.7530864477157593, "learning_rate": 0.0002, "epoch": 4.5237096097356275, "step": 5390}, {"loss": 1.4159, "grad_norm": 0.8739821910858154, "learning_rate": 0.0002, "epoch": 4.532102391942929, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.8090344667434692, "learning_rate": 0.0002, "epoch": 4.540495174150231, "step": 5410}, {"loss": 1.4187, "grad_norm": 0.7530879974365234, "learning_rate": 0.0002, "epoch": 4.548887956357532, "step": 5420}, {"loss": 1.47, "grad_norm": 0.8787251114845276, "learning_rate": 0.0002, "epoch": 4.557280738564835, "step": 5430}, {"loss": 1.375, "grad_norm": 0.813961923122406, "learning_rate": 0.0002, "epoch": 4.565673520772136, "step": 5440}, {"loss": 1.4475, "grad_norm": 0.7778232097625732, "learning_rate": 0.0002, "epoch": 4.574066302979437, "step": 5450}, {"loss": 1.4421, "grad_norm": 0.7323020696640015, "learning_rate": 0.0002, "epoch": 4.5824590851867395, "step": 5460}, {"loss": 1.396, "grad_norm": 0.7826765179634094, "learning_rate": 0.0002, "epoch": 4.590851867394041, "step": 5470}, {"loss": 1.4068, "grad_norm": 0.7245969772338867, "learning_rate": 0.0002, "epoch": 4.599244649601343, "step": 5480}, {"loss": 1.4276, "grad_norm": 0.7697308659553528, "learning_rate": 0.0002, "epoch": 4.607637431808644, "step": 5490}, {"loss": 1.3849, "grad_norm": 0.8053571581840515, "learning_rate": 0.0002, "epoch": 4.616030214015947, "step": 5500}, {"loss": 1.4225, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.624422996223248, "step": 5510}, {"loss": 1.3771, "grad_norm": 0.7398585677146912, "learning_rate": 0.0002, "epoch": 4.632815778430549, "step": 5520}, {"loss": 1.4216, "grad_norm": 0.7896319031715393, "learning_rate": 0.0002, "epoch": 4.6412085606378515, "step": 5530}, {"loss": 1.4199, "grad_norm": 0.8290980458259583, "learning_rate": 0.0002, "epoch": 4.649601342845153, "step": 5540}, {"loss": 1.463, "grad_norm": 0.8232647776603699, "learning_rate": 0.0002, "epoch": 4.657994125052455, "step": 5550}, {"loss": 1.3925, "grad_norm": 0.9154987335205078, "learning_rate": 0.0002, "epoch": 4.666386907259756, "step": 5560}, {"loss": 1.3674, "grad_norm": 0.8400886654853821, "learning_rate": 0.0002, "epoch": 4.674779689467059, "step": 5570}, {"loss": 1.379, "grad_norm": 0.7312718629837036, "learning_rate": 0.0002, "epoch": 4.68317247167436, "step": 5580}, {"loss": 1.3925, "grad_norm": 0.8043803572654724, "learning_rate": 0.0002, "epoch": 4.691565253881662, "step": 5590}, {"loss": 1.3952, "grad_norm": 0.7966225147247314, "learning_rate": 0.0002, "epoch": 4.6999580360889635, "step": 5600}, {"loss": 1.3429, "grad_norm": 0.881574809551239, "learning_rate": 0.0002, "epoch": 4.708350818296266, "step": 5610}, {"loss": 1.4444, "grad_norm": 0.7252084016799927, "learning_rate": 0.0002, "epoch": 4.716743600503567, "step": 5620}, {"loss": 1.3566, "grad_norm": 0.7726518511772156, "learning_rate": 0.0002, "epoch": 4.725136382710868, "step": 5630}, {"loss": 1.3954, "grad_norm": 0.7306379079818726, "learning_rate": 0.0002, "epoch": 4.7335291649181706, "step": 5640}, {"loss": 1.4385, "grad_norm": 0.8029969334602356, "learning_rate": 0.0002, "epoch": 4.741921947125472, "step": 5650}, {"loss": 1.3966, "grad_norm": 0.9103893637657166, "learning_rate": 0.0002, "epoch": 4.750314729332774, "step": 5660}, {"loss": 1.4026, "grad_norm": 0.8783416748046875, "learning_rate": 0.0002, "epoch": 4.758707511540075, "step": 5670}, {"loss": 1.3427, "grad_norm": 0.6807119846343994, "learning_rate": 0.0002, "epoch": 4.767100293747378, "step": 5680}, {"loss": 1.4148, "grad_norm": 0.7103772759437561, "learning_rate": 0.0002, "epoch": 4.775493075954679, "step": 5690}, {"loss": 1.4079, "grad_norm": 0.8472093343734741, "learning_rate": 0.0002, "epoch": 4.78388585816198, "step": 5700}, {"loss": 1.3937, "grad_norm": 0.851847231388092, "learning_rate": 0.0002, "epoch": 4.7922786403692825, "step": 5710}, {"loss": 1.3965, "grad_norm": 0.9084636569023132, "learning_rate": 0.0002, "epoch": 4.800671422576584, "step": 5720}, {"loss": 1.4358, "grad_norm": 0.7628585696220398, "learning_rate": 0.0002, "epoch": 4.809064204783886, "step": 5730}, {"loss": 1.3746, "grad_norm": 0.775580883026123, "learning_rate": 0.0002, "epoch": 4.817456986991187, "step": 5740}, {"loss": 1.4573, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 4.82584976919849, "step": 5750}, {"loss": 1.3991, "grad_norm": 0.7021728754043579, "learning_rate": 0.0002, "epoch": 4.834242551405791, "step": 5760}, {"loss": 1.4012, "grad_norm": 0.7810541391372681, "learning_rate": 0.0002, "epoch": 4.842635333613092, "step": 5770}, {"loss": 1.396, "grad_norm": 0.7290041446685791, "learning_rate": 0.0002, "epoch": 4.8510281158203945, "step": 5780}, {"loss": 1.4769, "grad_norm": 0.9059709906578064, "learning_rate": 0.0002, "epoch": 4.859420898027696, "step": 5790}, {"loss": 1.4091, "grad_norm": 0.8338062167167664, "learning_rate": 0.0002, "epoch": 4.867813680234998, "step": 5800}, {"loss": 1.395, "grad_norm": 0.830926775932312, "learning_rate": 0.0002, "epoch": 4.876206462442299, "step": 5810}, {"loss": 1.4261, "grad_norm": 0.7818633317947388, "learning_rate": 0.0002, "epoch": 4.884599244649602, "step": 5820}, {"loss": 1.4252, "grad_norm": 0.8143376708030701, "learning_rate": 0.0002, "epoch": 4.892992026856903, "step": 5830}, {"loss": 1.3583, "grad_norm": 0.7754496335983276, "learning_rate": 0.0002, "epoch": 4.901384809064205, "step": 5840}, {"loss": 1.4036, "grad_norm": 0.7154468297958374, "learning_rate": 0.0002, "epoch": 4.9097775912715065, "step": 5850}, {"loss": 1.3909, "grad_norm": 0.6829783916473389, "learning_rate": 0.0002, "epoch": 4.918170373478809, "step": 5860}, {"loss": 1.3854, "grad_norm": 0.784919261932373, "learning_rate": 0.0002, "epoch": 4.92656315568611, "step": 5870}, {"loss": 1.4277, "grad_norm": 0.8168354034423828, "learning_rate": 0.0002, "epoch": 4.934955937893411, "step": 5880}, {"loss": 1.3694, "grad_norm": 0.7356618642807007, "learning_rate": 0.0002, "epoch": 4.943348720100714, "step": 5890}, {"loss": 1.4827, "grad_norm": 0.7399224042892456, "learning_rate": 0.0002, "epoch": 4.951741502308015, "step": 5900}, {"loss": 1.3643, "grad_norm": 0.7430436015129089, "learning_rate": 0.0002, "epoch": 4.960134284515317, "step": 5910}, {"loss": 1.3836, "grad_norm": 0.7587705850601196, "learning_rate": 0.0002, "epoch": 4.9685270667226185, "step": 5920}, {"loss": 1.4162, "grad_norm": 0.9103638529777527, "learning_rate": 0.0002, "epoch": 4.976919848929921, "step": 5930}, {"loss": 1.4688, "grad_norm": 0.7357394695281982, "learning_rate": 0.0002, "epoch": 4.985312631137222, "step": 5940}, {"loss": 1.3988, "grad_norm": 0.7371547222137451, "learning_rate": 0.0002, "epoch": 4.993705413344523, "step": 5950}, {"eval_loss": 1.9367210865020752, "eval_runtime": 37.9833, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.9995803608896345, "step": 5957}, {"loss": 1.3876, "grad_norm": 0.7783351540565491, "learning_rate": 0.0002, "epoch": 5.0020981955518256, "step": 5960}, {"loss": 1.2387, "grad_norm": 0.9268898367881775, "learning_rate": 0.0002, "epoch": 5.010490977759127, "step": 5970}, {"loss": 1.2621, "grad_norm": 0.9562761783599854, "learning_rate": 0.0002, "epoch": 5.018883759966429, "step": 5980}, {"loss": 1.205, "grad_norm": 0.9391738176345825, "learning_rate": 0.0002, "epoch": 5.02727654217373, "step": 5990}, {"loss": 1.2112, "grad_norm": 0.850326418876648, "learning_rate": 0.0002, "epoch": 5.035669324381033, "step": 6000}, {"loss": 1.2285, "grad_norm": 0.8442679643630981, "learning_rate": 0.0002, "epoch": 5.044062106588334, "step": 6010}, {"loss": 1.1677, "grad_norm": 1.2147290706634521, "learning_rate": 0.0002, "epoch": 5.052454888795635, "step": 6020}, {"loss": 1.1836, "grad_norm": 0.9732922315597534, "learning_rate": 0.0002, "epoch": 5.0608476710029375, "step": 6030}, {"loss": 1.215, "grad_norm": 0.9354516267776489, "learning_rate": 0.0002, "epoch": 5.069240453210239, "step": 6040}, {"loss": 1.1918, "grad_norm": 0.9681560397148132, "learning_rate": 0.0002, "epoch": 5.077633235417541, "step": 6050}, {"loss": 1.2146, "grad_norm": 0.9500439763069153, "learning_rate": 0.0002, "epoch": 5.086026017624842, "step": 6060}, {"loss": 1.1475, "grad_norm": 0.8693879246711731, "learning_rate": 0.0002, "epoch": 5.094418799832145, "step": 6070}, {"loss": 1.2181, "grad_norm": 1.1066458225250244, "learning_rate": 0.0002, "epoch": 5.102811582039446, "step": 6080}, {"loss": 1.2135, "grad_norm": 0.9530285000801086, "learning_rate": 0.0002, "epoch": 5.111204364246748, "step": 6090}, {"loss": 1.2388, "grad_norm": 0.9323630928993225, "learning_rate": 0.0002, "epoch": 5.1195971464540495, "step": 6100}, {"loss": 1.2434, "grad_norm": 0.9040294885635376, "learning_rate": 0.0002, "epoch": 5.127989928661351, "step": 6110}, {"loss": 1.2502, "grad_norm": 0.9981122612953186, "learning_rate": 0.0002, "epoch": 5.136382710868653, "step": 6120}, {"loss": 1.2648, "grad_norm": 0.9070921540260315, "learning_rate": 0.0002, "epoch": 5.144775493075954, "step": 6130}, {"loss": 1.2802, "grad_norm": 1.043802261352539, "learning_rate": 0.0002, "epoch": 5.153168275283257, "step": 6140}, {"loss": 1.1865, "grad_norm": 1.0889761447906494, "learning_rate": 0.0002, "epoch": 5.161561057490558, "step": 6150}, {"loss": 1.2498, "grad_norm": 0.9908999800682068, "learning_rate": 0.0002, "epoch": 5.16995383969786, "step": 6160}, {"loss": 1.2981, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.1783466219051615, "step": 6170}, {"loss": 1.2236, "grad_norm": 0.9536478519439697, "learning_rate": 0.0002, "epoch": 5.186739404112464, "step": 6180}, {"loss": 1.1889, "grad_norm": 0.8672952055931091, "learning_rate": 0.0002, "epoch": 5.195132186319765, "step": 6190}, {"loss": 1.2142, "grad_norm": 1.0116329193115234, "learning_rate": 0.0002, "epoch": 5.203524968527066, "step": 6200}, {"loss": 1.1813, "grad_norm": 0.9327153563499451, "learning_rate": 0.0002, "epoch": 5.211917750734369, "step": 6210}, {"loss": 1.2372, "grad_norm": 0.85637366771698, "learning_rate": 0.0002, "epoch": 5.22031053294167, "step": 6220}, {"loss": 1.2949, "grad_norm": 1.0490736961364746, "learning_rate": 0.0002, "epoch": 5.228703315148972, "step": 6230}, {"loss": 1.1604, "grad_norm": 0.8849565982818604, "learning_rate": 0.0002, "epoch": 5.2370960973562735, "step": 6240}, {"loss": 1.2257, "grad_norm": 0.8852671980857849, "learning_rate": 0.0002, "epoch": 5.245488879563576, "step": 6250}, {"loss": 1.275, "grad_norm": 0.9146860241889954, "learning_rate": 0.0002, "epoch": 5.253881661770877, "step": 6260}, {"loss": 1.2543, "grad_norm": 1.0188325643539429, "learning_rate": 0.0002, "epoch": 5.262274443978178, "step": 6270}, {"loss": 1.1703, "grad_norm": 1.0053156614303589, "learning_rate": 0.0002, "epoch": 5.270667226185481, "step": 6280}, {"loss": 1.2594, "grad_norm": 0.9962273836135864, "learning_rate": 0.0002, "epoch": 5.279060008392782, "step": 6290}, {"loss": 1.2487, "grad_norm": 1.000300645828247, "learning_rate": 0.0002, "epoch": 5.287452790600084, "step": 6300}, {"loss": 1.3214, "grad_norm": 0.9821932911872864, "learning_rate": 0.0002, "epoch": 5.295845572807385, "step": 6310}, {"loss": 1.2964, "grad_norm": 1.0103896856307983, "learning_rate": 0.0002, "epoch": 5.304238355014688, "step": 6320}, {"loss": 1.2497, "grad_norm": 0.9323601722717285, "learning_rate": 0.0002, "epoch": 5.312631137221989, "step": 6330}, {"loss": 1.3165, "grad_norm": 1.0668879747390747, "learning_rate": 0.0002, "epoch": 5.321023919429291, "step": 6340}, {"loss": 1.2411, "grad_norm": 0.9666323065757751, "learning_rate": 0.0002, "epoch": 5.3294167016365925, "step": 6350}, {"loss": 1.2129, "grad_norm": 0.9439574480056763, "learning_rate": 0.0002, "epoch": 5.337809483843894, "step": 6360}, {"loss": 1.2355, "grad_norm": 1.0229361057281494, "learning_rate": 0.0002, "epoch": 5.346202266051196, "step": 6370}, {"loss": 1.2021, "grad_norm": 0.8522404432296753, "learning_rate": 0.0002, "epoch": 5.354595048258497, "step": 6380}, {"loss": 1.32, "grad_norm": 1.3732287883758545, "learning_rate": 0.0002, "epoch": 5.3629878304658, "step": 6390}, {"loss": 1.1987, "grad_norm": 0.8201091885566711, "learning_rate": 0.0002, "epoch": 5.371380612673101, "step": 6400}, {"loss": 1.2867, "grad_norm": 0.8874436616897583, "learning_rate": 0.0002, "epoch": 5.379773394880403, "step": 6410}, {"loss": 1.2686, "grad_norm": 1.0118640661239624, "learning_rate": 0.0002, "epoch": 5.3881661770877045, "step": 6420}, {"loss": 1.2952, "grad_norm": 1.0468370914459229, "learning_rate": 0.0002, "epoch": 5.396558959295007, "step": 6430}, {"loss": 1.2057, "grad_norm": 0.941806972026825, "learning_rate": 0.0002, "epoch": 5.404951741502308, "step": 6440}, {"loss": 1.3289, "grad_norm": 0.9860424399375916, "learning_rate": 0.0002, "epoch": 5.413344523709609, "step": 6450}, {"loss": 1.2887, "grad_norm": 1.009628176689148, "learning_rate": 0.0002, "epoch": 5.421737305916912, "step": 6460}, {"loss": 1.2544, "grad_norm": 0.9842159748077393, "learning_rate": 0.0002, "epoch": 5.430130088124213, "step": 6470}, {"loss": 1.2277, "grad_norm": 0.9935571551322937, "learning_rate": 0.0002, "epoch": 5.438522870331515, "step": 6480}, {"loss": 1.2392, "grad_norm": 0.8872362971305847, "learning_rate": 0.0002, "epoch": 5.4469156525388165, "step": 6490}, {"loss": 1.2166, "grad_norm": 0.9530836939811707, "learning_rate": 0.0002, "epoch": 5.455308434746119, "step": 6500}, {"loss": 1.2138, "grad_norm": 0.8111279010772705, "learning_rate": 0.0002, "epoch": 5.46370121695342, "step": 6510}, {"loss": 1.2375, "grad_norm": 1.0474516153335571, "learning_rate": 0.0002, "epoch": 5.472093999160721, "step": 6520}, {"loss": 1.2752, "grad_norm": 1.0228482484817505, "learning_rate": 0.0002, "epoch": 5.480486781368024, "step": 6530}, {"loss": 1.2739, "grad_norm": 1.0299347639083862, "learning_rate": 0.0002, "epoch": 5.488879563575325, "step": 6540}, {"loss": 1.3163, "grad_norm": 0.9105098247528076, "learning_rate": 0.0002, "epoch": 5.497272345782627, "step": 6550}, {"loss": 1.2718, "grad_norm": 1.2459523677825928, "learning_rate": 0.0002, "epoch": 5.5056651279899285, "step": 6560}, {"loss": 1.2697, "grad_norm": 1.0630481243133545, "learning_rate": 0.0002, "epoch": 5.514057910197231, "step": 6570}, {"loss": 1.3003, "grad_norm": 0.8310980796813965, "learning_rate": 0.0002, "epoch": 5.522450692404532, "step": 6580}, {"loss": 1.1855, "grad_norm": 1.102723479270935, "learning_rate": 0.0002, "epoch": 5.530843474611833, "step": 6590}, {"loss": 1.2889, "grad_norm": 0.9586807489395142, "learning_rate": 0.0002, "epoch": 5.539236256819136, "step": 6600}, {"loss": 1.2899, "grad_norm": 0.976191520690918, "learning_rate": 0.0002, "epoch": 5.547629039026437, "step": 6610}, {"loss": 1.2319, "grad_norm": 0.9943762421607971, "learning_rate": 0.0002, "epoch": 5.556021821233739, "step": 6620}, {"loss": 1.3103, "grad_norm": 0.8788089156150818, "learning_rate": 0.0002, "epoch": 5.56441460344104, "step": 6630}, {"loss": 1.1982, "grad_norm": 0.9866173267364502, "learning_rate": 0.0002, "epoch": 5.572807385648343, "step": 6640}, {"loss": 1.2686, "grad_norm": 1.0791642665863037, "learning_rate": 0.0002, "epoch": 5.581200167855644, "step": 6650}, {"loss": 1.2806, "grad_norm": 0.836482584476471, "learning_rate": 0.0002, "epoch": 5.589592950062946, "step": 6660}, {"loss": 1.3114, "grad_norm": 0.9841130971908569, "learning_rate": 0.0002, "epoch": 5.5979857322702475, "step": 6670}, {"loss": 1.2323, "grad_norm": 0.9678813815116882, "learning_rate": 0.0002, "epoch": 5.60637851447755, "step": 6680}, {"loss": 1.1969, "grad_norm": 0.9033233523368835, "learning_rate": 0.0002, "epoch": 5.614771296684851, "step": 6690}, {"loss": 1.2565, "grad_norm": 0.8691515922546387, "learning_rate": 0.0002, "epoch": 5.623164078892152, "step": 6700}, {"loss": 1.2678, "grad_norm": 0.8971360921859741, "learning_rate": 0.0002, "epoch": 5.631556861099455, "step": 6710}, {"loss": 1.2266, "grad_norm": 0.9377756118774414, "learning_rate": 0.0002, "epoch": 5.639949643306756, "step": 6720}, {"loss": 1.28, "grad_norm": 0.908762514591217, "learning_rate": 0.0002, "epoch": 5.648342425514058, "step": 6730}, {"loss": 1.2499, "grad_norm": 1.0503337383270264, "learning_rate": 0.0002, "epoch": 5.6567352077213595, "step": 6740}, {"loss": 1.3604, "grad_norm": 1.030267357826233, "learning_rate": 0.0002, "epoch": 5.665127989928662, "step": 6750}, {"loss": 1.2223, "grad_norm": 0.9150485992431641, "learning_rate": 0.0002, "epoch": 5.673520772135963, "step": 6760}, {"loss": 1.2651, "grad_norm": 1.0300343036651611, "learning_rate": 0.0002, "epoch": 5.681913554343264, "step": 6770}, {"loss": 1.2506, "grad_norm": 1.1242924928665161, "learning_rate": 0.0002, "epoch": 5.690306336550567, "step": 6780}, {"loss": 1.3318, "grad_norm": 0.9489498138427734, "learning_rate": 0.0002, "epoch": 5.698699118757868, "step": 6790}, {"loss": 1.2578, "grad_norm": 0.8829707503318787, "learning_rate": 0.0002, "epoch": 5.70709190096517, "step": 6800}, {"loss": 1.2765, "grad_norm": 1.01392662525177, "learning_rate": 0.0002, "epoch": 5.7154846831724715, "step": 6810}, {"loss": 1.3029, "grad_norm": 0.9234510064125061, "learning_rate": 0.0002, "epoch": 5.723877465379774, "step": 6820}, {"loss": 1.2891, "grad_norm": 0.9439187049865723, "learning_rate": 0.0002, "epoch": 5.732270247587075, "step": 6830}, {"loss": 1.2627, "grad_norm": 0.8833441734313965, "learning_rate": 0.0002, "epoch": 5.740663029794376, "step": 6840}, {"loss": 1.3195, "grad_norm": 0.9394439458847046, "learning_rate": 0.0002, "epoch": 5.749055812001679, "step": 6850}, {"loss": 1.3108, "grad_norm": 0.9980010390281677, "learning_rate": 0.0002, "epoch": 5.75744859420898, "step": 6860}, {"loss": 1.2958, "grad_norm": 0.9612377882003784, "learning_rate": 0.0002, "epoch": 5.765841376416282, "step": 6870}, {"loss": 1.2173, "grad_norm": 1.0817323923110962, "learning_rate": 0.0002, "epoch": 5.7742341586235835, "step": 6880}, {"loss": 1.2485, "grad_norm": 0.8445103168487549, "learning_rate": 0.0002, "epoch": 5.782626940830886, "step": 6890}, {"loss": 1.2573, "grad_norm": 0.8535459041595459, "learning_rate": 0.0002, "epoch": 5.791019723038187, "step": 6900}, {"loss": 1.2729, "grad_norm": 0.9131284356117249, "learning_rate": 0.0002, "epoch": 5.799412505245489, "step": 6910}, {"loss": 1.1934, "grad_norm": 0.8627726435661316, "learning_rate": 0.0002, "epoch": 5.807805287452791, "step": 6920}, {"loss": 1.3226, "grad_norm": 0.8599951863288879, "learning_rate": 0.0002, "epoch": 5.816198069660093, "step": 6930}, {"loss": 1.3078, "grad_norm": 1.0746861696243286, "learning_rate": 0.0002, "epoch": 5.824590851867394, "step": 6940}, {"loss": 1.2653, "grad_norm": 1.0220543146133423, "learning_rate": 0.0002, "epoch": 5.8329836340746954, "step": 6950}, {"loss": 1.3168, "grad_norm": 0.8891388177871704, "learning_rate": 0.0002, "epoch": 5.841376416281998, "step": 6960}, {"loss": 1.2845, "grad_norm": 1.1404683589935303, "learning_rate": 0.0002, "epoch": 5.849769198489299, "step": 6970}, {"loss": 1.2361, "grad_norm": 0.9665380120277405, "learning_rate": 0.0002, "epoch": 5.858161980696601, "step": 6980}, {"loss": 1.2622, "grad_norm": 0.9837968945503235, "learning_rate": 0.0002, "epoch": 5.8665547629039025, "step": 6990}, {"loss": 1.2973, "grad_norm": 1.0278598070144653, "learning_rate": 0.0002, "epoch": 5.874947545111205, "step": 7000}, {"loss": 1.2334, "grad_norm": 0.9990253448486328, "learning_rate": 0.0002, "epoch": 5.883340327318506, "step": 7010}, {"loss": 1.3508, "grad_norm": 0.9705647230148315, "learning_rate": 0.0002, "epoch": 5.891733109525807, "step": 7020}, {"loss": 1.335, "grad_norm": 0.9672252535820007, "learning_rate": 0.0002, "epoch": 5.90012589173311, "step": 7030}, {"loss": 1.2944, "grad_norm": 0.9467034339904785, "learning_rate": 0.0002, "epoch": 5.908518673940411, "step": 7040}, {"loss": 1.2704, "grad_norm": 0.9506469964981079, "learning_rate": 0.0002, "epoch": 5.916911456147713, "step": 7050}, {"loss": 1.2745, "grad_norm": 0.8936163783073425, "learning_rate": 0.0002, "epoch": 5.9253042383550145, "step": 7060}, {"loss": 1.2702, "grad_norm": 0.956101655960083, "learning_rate": 0.0002, "epoch": 5.933697020562317, "step": 7070}, {"loss": 1.2532, "grad_norm": 0.893535852432251, "learning_rate": 0.0002, "epoch": 5.942089802769618, "step": 7080}, {"loss": 1.342, "grad_norm": 1.0313799381256104, "learning_rate": 0.0002, "epoch": 5.950482584976919, "step": 7090}, {"loss": 1.3398, "grad_norm": 0.8567915558815002, "learning_rate": 0.0002, "epoch": 5.958875367184222, "step": 7100}, {"loss": 1.3127, "grad_norm": 0.9683501720428467, "learning_rate": 0.0002, "epoch": 5.967268149391523, "step": 7110}, {"loss": 1.2522, "grad_norm": 0.9401984214782715, "learning_rate": 0.0002, "epoch": 5.975660931598825, "step": 7120}, {"loss": 1.3211, "grad_norm": 1.0316764116287231, "learning_rate": 0.0002, "epoch": 5.9840537138061265, "step": 7130}, {"loss": 1.2445, "grad_norm": 0.9335392713546753, "learning_rate": 0.0002, "epoch": 5.992446496013429, "step": 7140}, {"eval_loss": 2.041194438934326, "eval_runtime": 37.9642, "eval_samples_per_second": 13.565, "eval_steps_per_second": 1.712, "epoch": 6.0, "step": 7149}, {"loss": 1.2531, "grad_norm": 1.0247591733932495, "learning_rate": 0.0002, "epoch": 6.00083927822073, "step": 7150}, {"loss": 1.1125, "grad_norm": 1.4086190462112427, "learning_rate": 0.0002, "epoch": 6.009232060428032, "step": 7160}, {"loss": 1.0702, "grad_norm": 1.0636897087097168, "learning_rate": 0.0002, "epoch": 6.017624842635334, "step": 7170}, {"loss": 1.118, "grad_norm": 1.1334257125854492, "learning_rate": 0.0002, "epoch": 6.026017624842635, "step": 7180}, {"loss": 1.0428, "grad_norm": 1.1142425537109375, "learning_rate": 0.0002, "epoch": 6.034410407049937, "step": 7190}, {"loss": 1.0439, "grad_norm": 1.1448479890823364, "learning_rate": 0.0002, "epoch": 6.0428031892572385, "step": 7200}, {"loss": 1.0364, "grad_norm": 1.181567907333374, "learning_rate": 0.0002, "epoch": 6.051195971464541, "step": 7210}, {"loss": 1.0435, "grad_norm": 1.0471529960632324, "learning_rate": 0.0002, "epoch": 6.059588753671842, "step": 7220}, {"loss": 1.0828, "grad_norm": 1.1432698965072632, "learning_rate": 0.0002, "epoch": 6.067981535879144, "step": 7230}, {"loss": 1.095, "grad_norm": 1.1316763162612915, "learning_rate": 0.0002, "epoch": 6.076374318086446, "step": 7240}, {"loss": 1.0767, "grad_norm": 0.9800271391868591, "learning_rate": 0.0002, "epoch": 6.084767100293748, "step": 7250}, {"loss": 1.0984, "grad_norm": 1.1878576278686523, "learning_rate": 0.0002, "epoch": 6.093159882501049, "step": 7260}, {"loss": 1.1225, "grad_norm": 1.0174267292022705, "learning_rate": 0.0002, "epoch": 6.1015526647083504, "step": 7270}, {"loss": 1.0747, "grad_norm": 0.9622059464454651, "learning_rate": 0.0002, "epoch": 6.109945446915653, "step": 7280}, {"loss": 1.1606, "grad_norm": 1.3247325420379639, "learning_rate": 0.0002, "epoch": 6.118338229122954, "step": 7290}, {"loss": 1.0533, "grad_norm": 1.2405189275741577, "learning_rate": 0.0002, "epoch": 6.126731011330256, "step": 7300}, {"loss": 1.1345, "grad_norm": 1.025123953819275, "learning_rate": 0.0002, "epoch": 6.1351237935375575, "step": 7310}, {"loss": 1.0879, "grad_norm": 1.2966125011444092, "learning_rate": 0.0002, "epoch": 6.14351657574486, "step": 7320}, {"loss": 1.106, "grad_norm": 1.0655252933502197, "learning_rate": 0.0002, "epoch": 6.151909357952161, "step": 7330}, {"loss": 1.1089, "grad_norm": 1.076251745223999, "learning_rate": 0.0002, "epoch": 6.160302140159462, "step": 7340}, {"loss": 1.1144, "grad_norm": 1.0632140636444092, "learning_rate": 0.0002, "epoch": 6.168694922366765, "step": 7350}, {"loss": 1.1284, "grad_norm": 1.392654538154602, "learning_rate": 0.0002, "epoch": 6.177087704574066, "step": 7360}, {"loss": 1.0909, "grad_norm": 1.071683645248413, "learning_rate": 0.0002, "epoch": 6.185480486781368, "step": 7370}, {"loss": 1.1041, "grad_norm": 1.0602295398712158, "learning_rate": 0.0002, "epoch": 6.1938732689886695, "step": 7380}, {"loss": 1.083, "grad_norm": 1.2152365446090698, "learning_rate": 0.0002, "epoch": 6.202266051195972, "step": 7390}, {"loss": 1.0622, "grad_norm": 1.1637049913406372, "learning_rate": 0.0002, "epoch": 6.210658833403273, "step": 7400}, {"loss": 1.1107, "grad_norm": 1.3976062536239624, "learning_rate": 0.0002, "epoch": 6.219051615610575, "step": 7410}, {"loss": 1.084, "grad_norm": 1.1892462968826294, "learning_rate": 0.0002, "epoch": 6.227444397817877, "step": 7420}, {"loss": 1.0517, "grad_norm": 1.23629629611969, "learning_rate": 0.0002, "epoch": 6.235837180025178, "step": 7430}, {"loss": 1.1069, "grad_norm": 1.2072324752807617, "learning_rate": 0.0002, "epoch": 6.24422996223248, "step": 7440}, {"loss": 1.172, "grad_norm": 1.2027140855789185, "learning_rate": 0.0002, "epoch": 6.2526227444397815, "step": 7450}, {"loss": 1.0373, "grad_norm": 1.2129466533660889, "learning_rate": 0.0002, "epoch": 6.261015526647084, "step": 7460}, {"loss": 1.1493, "grad_norm": 1.1675773859024048, "learning_rate": 0.0002, "epoch": 6.269408308854385, "step": 7470}, {"loss": 1.0884, "grad_norm": 1.189106822013855, "learning_rate": 0.0002, "epoch": 6.277801091061687, "step": 7480}, {"loss": 1.1557, "grad_norm": 0.9968156218528748, "learning_rate": 0.0002, "epoch": 6.286193873268989, "step": 7490}, {"loss": 1.1816, "grad_norm": 1.2140403985977173, "learning_rate": 0.0002, "epoch": 6.294586655476291, "step": 7500}, {"loss": 1.1163, "grad_norm": 1.1790717840194702, "learning_rate": 0.0002, "epoch": 6.302979437683592, "step": 7510}, {"loss": 1.114, "grad_norm": 1.1867438554763794, "learning_rate": 0.0002, "epoch": 6.3113722198908935, "step": 7520}, {"loss": 1.1697, "grad_norm": 1.2212399244308472, "learning_rate": 0.0002, "epoch": 6.319765002098196, "step": 7530}, {"loss": 1.1103, "grad_norm": 1.1840152740478516, "learning_rate": 0.0002, "epoch": 6.328157784305497, "step": 7540}, {"loss": 1.015, "grad_norm": 1.1392520666122437, "learning_rate": 0.0002, "epoch": 6.336550566512799, "step": 7550}, {"loss": 1.1686, "grad_norm": 1.2683428525924683, "learning_rate": 0.0002, "epoch": 6.344943348720101, "step": 7560}, {"loss": 1.1221, "grad_norm": 1.2927075624465942, "learning_rate": 0.0002, "epoch": 6.353336130927403, "step": 7570}, {"loss": 1.1728, "grad_norm": 1.1633557081222534, "learning_rate": 0.0002, "epoch": 6.361728913134704, "step": 7580}, {"loss": 1.0448, "grad_norm": 1.2839789390563965, "learning_rate": 0.0002, "epoch": 6.3701216953420055, "step": 7590}, {"loss": 1.0679, "grad_norm": 1.1563365459442139, "learning_rate": 0.0002, "epoch": 6.378514477549308, "step": 7600}, {"loss": 1.1222, "grad_norm": 1.3075823783874512, "learning_rate": 0.0002, "epoch": 6.386907259756609, "step": 7610}, {"loss": 1.1872, "grad_norm": 1.1148593425750732, "learning_rate": 0.0002, "epoch": 6.395300041963911, "step": 7620}, {"loss": 1.1296, "grad_norm": 1.3017758131027222, "learning_rate": 0.0002, "epoch": 6.4036928241712125, "step": 7630}, {"loss": 1.0982, "grad_norm": 1.3302847146987915, "learning_rate": 0.0002, "epoch": 6.412085606378515, "step": 7640}, {"loss": 1.1228, "grad_norm": 1.3263767957687378, "learning_rate": 0.0002, "epoch": 6.420478388585816, "step": 7650}, {"loss": 1.1036, "grad_norm": 1.2079416513442993, "learning_rate": 0.0002, "epoch": 6.428871170793118, "step": 7660}, {"loss": 1.0885, "grad_norm": 1.1282644271850586, "learning_rate": 0.0002, "epoch": 6.43726395300042, "step": 7670}, {"loss": 1.1437, "grad_norm": 1.1894482374191284, "learning_rate": 0.0002, "epoch": 6.445656735207721, "step": 7680}, {"loss": 1.1531, "grad_norm": 1.2007642984390259, "learning_rate": 0.0002, "epoch": 6.454049517415023, "step": 7690}, {"loss": 1.1639, "grad_norm": 1.3172780275344849, "learning_rate": 0.0002, "epoch": 6.4624422996223245, "step": 7700}, {"loss": 1.1477, "grad_norm": 1.113945722579956, "learning_rate": 0.0002, "epoch": 6.470835081829627, "step": 7710}, {"loss": 1.0852, "grad_norm": 1.1763832569122314, "learning_rate": 0.0002, "epoch": 6.479227864036928, "step": 7720}, {"loss": 1.1121, "grad_norm": 1.196928858757019, "learning_rate": 0.0002, "epoch": 6.48762064624423, "step": 7730}, {"loss": 1.1736, "grad_norm": 1.2109456062316895, "learning_rate": 0.0002, "epoch": 6.496013428451532, "step": 7740}, {"loss": 1.1575, "grad_norm": 1.3580254316329956, "learning_rate": 0.0002, "epoch": 6.504406210658834, "step": 7750}, {"loss": 1.0606, "grad_norm": 1.0432099103927612, "learning_rate": 0.0002, "epoch": 6.512798992866135, "step": 7760}, {"loss": 1.1453, "grad_norm": 1.0125840902328491, "learning_rate": 0.0002, "epoch": 6.5211917750734365, "step": 7770}, {"loss": 1.1112, "grad_norm": 1.5847094058990479, "learning_rate": 0.0002, "epoch": 6.529584557280739, "step": 7780}, {"loss": 1.0885, "grad_norm": 1.161391258239746, "learning_rate": 0.0002, "epoch": 6.53797733948804, "step": 7790}, {"loss": 1.1549, "grad_norm": 1.1106663942337036, "learning_rate": 0.0002, "epoch": 6.546370121695342, "step": 7800}, {"loss": 1.0584, "grad_norm": 1.2467689514160156, "learning_rate": 0.0002, "epoch": 6.554762903902644, "step": 7810}, {"loss": 1.0923, "grad_norm": 1.1907767057418823, "learning_rate": 0.0002, "epoch": 6.563155686109946, "step": 7820}, {"loss": 1.1606, "grad_norm": 1.1521105766296387, "learning_rate": 0.0002, "epoch": 6.571548468317247, "step": 7830}, {"loss": 1.1644, "grad_norm": 1.2498128414154053, "learning_rate": 0.0002, "epoch": 6.5799412505245485, "step": 7840}, {"loss": 1.0948, "grad_norm": 1.1506036520004272, "learning_rate": 0.0002, "epoch": 6.588334032731851, "step": 7850}, {"loss": 1.1499, "grad_norm": 1.118890404701233, "learning_rate": 0.0002, "epoch": 6.596726814939152, "step": 7860}, {"loss": 1.1352, "grad_norm": 1.1001442670822144, "learning_rate": 0.0002, "epoch": 6.605119597146454, "step": 7870}, {"loss": 1.1139, "grad_norm": 1.1551518440246582, "learning_rate": 0.0002, "epoch": 6.613512379353756, "step": 7880}, {"loss": 1.1255, "grad_norm": 1.1872174739837646, "learning_rate": 0.0002, "epoch": 6.621905161561058, "step": 7890}, {"loss": 1.1013, "grad_norm": 1.1665245294570923, "learning_rate": 0.0002, "epoch": 6.630297943768359, "step": 7900}, {"loss": 1.1857, "grad_norm": 1.1592308282852173, "learning_rate": 0.0002, "epoch": 6.6386907259756605, "step": 7910}, {"loss": 1.1639, "grad_norm": 1.2712409496307373, "learning_rate": 0.0002, "epoch": 6.647083508182963, "step": 7920}, {"loss": 1.147, "grad_norm": 1.0665934085845947, "learning_rate": 0.0002, "epoch": 6.655476290390264, "step": 7930}, {"loss": 1.1437, "grad_norm": 1.1843419075012207, "learning_rate": 0.0002, "epoch": 6.663869072597566, "step": 7940}, {"loss": 1.1359, "grad_norm": 1.4945712089538574, "learning_rate": 0.0002, "epoch": 6.6722618548048676, "step": 7950}, {"loss": 1.1772, "grad_norm": 1.3284149169921875, "learning_rate": 0.0002, "epoch": 6.68065463701217, "step": 7960}, {"loss": 1.1183, "grad_norm": 1.1670401096343994, "learning_rate": 0.0002, "epoch": 6.689047419219471, "step": 7970}, {"loss": 1.1808, "grad_norm": 1.1963475942611694, "learning_rate": 0.0002, "epoch": 6.697440201426773, "step": 7980}, {"loss": 1.1489, "grad_norm": 1.077380657196045, "learning_rate": 0.0002, "epoch": 6.705832983634075, "step": 7990}, {"loss": 1.1661, "grad_norm": 0.8758405447006226, "learning_rate": 0.0002, "epoch": 6.714225765841377, "step": 8000}, {"loss": 1.169, "grad_norm": 1.2686632871627808, "learning_rate": 0.0002, "epoch": 6.722618548048678, "step": 8010}, {"loss": 1.1486, "grad_norm": 1.1136665344238281, "learning_rate": 0.0002, "epoch": 6.7310113302559795, "step": 8020}, {"loss": 1.1439, "grad_norm": 1.25029456615448, "learning_rate": 0.0002, "epoch": 6.739404112463282, "step": 8030}, {"loss": 1.1121, "grad_norm": 1.0269629955291748, "learning_rate": 0.0002, "epoch": 6.747796894670583, "step": 8040}, {"loss": 1.1707, "grad_norm": 1.1515758037567139, "learning_rate": 0.0002, "epoch": 6.756189676877885, "step": 8050}, {"loss": 1.1487, "grad_norm": 1.1150308847427368, "learning_rate": 0.0002, "epoch": 6.764582459085187, "step": 8060}, {"loss": 1.088, "grad_norm": 1.025669813156128, "learning_rate": 0.0002, "epoch": 6.772975241292489, "step": 8070}, {"loss": 1.1002, "grad_norm": 1.0564825534820557, "learning_rate": 0.0002, "epoch": 6.78136802349979, "step": 8080}, {"loss": 1.1722, "grad_norm": 1.1695157289505005, "learning_rate": 0.0002, "epoch": 6.7897608057070915, "step": 8090}, {"loss": 1.1322, "grad_norm": 1.1086713075637817, "learning_rate": 0.0002, "epoch": 6.798153587914394, "step": 8100}, {"loss": 1.2036, "grad_norm": 1.0446662902832031, "learning_rate": 0.0002, "epoch": 6.806546370121695, "step": 8110}, {"loss": 1.1106, "grad_norm": 1.2017868757247925, "learning_rate": 0.0002, "epoch": 6.814939152328997, "step": 8120}, {"loss": 1.1316, "grad_norm": 1.2538378238677979, "learning_rate": 0.0002, "epoch": 6.823331934536299, "step": 8130}, {"loss": 1.1506, "grad_norm": 1.1552783250808716, "learning_rate": 0.0002, "epoch": 6.831724716743601, "step": 8140}, {"loss": 1.1623, "grad_norm": 1.2151418924331665, "learning_rate": 0.0002, "epoch": 6.840117498950902, "step": 8150}, {"loss": 1.121, "grad_norm": 1.1431301832199097, "learning_rate": 0.0002, "epoch": 6.8485102811582035, "step": 8160}, {"loss": 1.1312, "grad_norm": 1.0864715576171875, "learning_rate": 0.0002, "epoch": 6.856903063365506, "step": 8170}, {"loss": 1.1777, "grad_norm": 1.2602605819702148, "learning_rate": 0.0002, "epoch": 6.865295845572807, "step": 8180}, {"loss": 1.1237, "grad_norm": 1.1670788526535034, "learning_rate": 0.0002, "epoch": 6.873688627780109, "step": 8190}, {"loss": 1.1728, "grad_norm": 1.1444851160049438, "learning_rate": 0.0002, "epoch": 6.882081409987411, "step": 8200}, {"loss": 1.1208, "grad_norm": 1.1726973056793213, "learning_rate": 0.0002, "epoch": 6.890474192194713, "step": 8210}, {"loss": 1.1666, "grad_norm": 1.0436229705810547, "learning_rate": 0.0002, "epoch": 6.898866974402014, "step": 8220}, {"loss": 1.097, "grad_norm": 1.3296568393707275, "learning_rate": 0.0002, "epoch": 6.907259756609316, "step": 8230}, {"loss": 1.0581, "grad_norm": 1.2561821937561035, "learning_rate": 0.0002, "epoch": 6.915652538816618, "step": 8240}, {"loss": 1.2125, "grad_norm": 1.2071776390075684, "learning_rate": 0.0002, "epoch": 6.92404532102392, "step": 8250}, {"loss": 1.1433, "grad_norm": 1.115523099899292, "learning_rate": 0.0002, "epoch": 6.932438103231221, "step": 8260}, {"loss": 1.2104, "grad_norm": 1.145468831062317, "learning_rate": 0.0002, "epoch": 6.940830885438523, "step": 8270}, {"loss": 1.1654, "grad_norm": 1.2517759799957275, "learning_rate": 0.0002, "epoch": 6.949223667645825, "step": 8280}, {"loss": 1.0968, "grad_norm": 1.1757365465164185, "learning_rate": 0.0002, "epoch": 6.957616449853126, "step": 8290}, {"loss": 1.1899, "grad_norm": 1.0645636320114136, "learning_rate": 0.0002, "epoch": 6.966009232060428, "step": 8300}, {"loss": 1.2665, "grad_norm": 1.2390278577804565, "learning_rate": 0.0002, "epoch": 6.97440201426773, "step": 8310}, {"loss": 1.1491, "grad_norm": 1.202418327331543, "learning_rate": 0.0002, "epoch": 6.982794796475032, "step": 8320}, {"loss": 1.1722, "grad_norm": 1.0840344429016113, "learning_rate": 0.0002, "epoch": 6.991187578682333, "step": 8330}, {"loss": 1.1172, "grad_norm": 1.2504760026931763, "learning_rate": 0.0002, "epoch": 6.9995803608896345, "step": 8340}]} +{"epoch": 7.996642887117079, "step": 9528, "epoch_duration": 1264.711720943451, "total_accumulated_duration": 10306.808114767075, "gpu_info": {"GPU_0": "NVIDIA A100-SXM4-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-5623-sd-0/checkpoint-2383", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.667, "grad_norm": 0.6016407012939453, "learning_rate": 0.0002, "epoch": 0.00839278220730172, "step": 10}, {"loss": 2.2702, "grad_norm": 0.5444163084030151, "learning_rate": 0.0002, "epoch": 0.01678556441460344, "step": 20}, {"loss": 2.004, "grad_norm": 0.5771743059158325, "learning_rate": 0.0002, "epoch": 0.02517834662190516, "step": 30}, {"loss": 1.9819, "grad_norm": 0.5426492094993591, "learning_rate": 0.0002, "epoch": 0.03357112882920688, "step": 40}, {"loss": 2.0078, "grad_norm": 0.5884947180747986, "learning_rate": 0.0002, "epoch": 0.0419639110365086, "step": 50}, {"loss": 1.875, "grad_norm": 0.47584953904151917, "learning_rate": 0.0002, "epoch": 0.05035669324381032, "step": 60}, {"loss": 1.8831, "grad_norm": 0.529290497303009, "learning_rate": 0.0002, "epoch": 0.058749475451112046, "step": 70}, {"loss": 1.9296, "grad_norm": 0.48883911967277527, "learning_rate": 0.0002, "epoch": 0.06714225765841376, "step": 80}, {"loss": 1.8456, "grad_norm": 0.4272284209728241, "learning_rate": 0.0002, "epoch": 0.07553503986571548, "step": 90}, {"loss": 1.9089, "grad_norm": 0.42270252108573914, "learning_rate": 0.0002, "epoch": 0.0839278220730172, "step": 100}, {"loss": 1.8279, "grad_norm": 0.45384910702705383, "learning_rate": 0.0002, "epoch": 0.09232060428031892, "step": 110}, {"loss": 1.9126, "grad_norm": 0.37896445393562317, "learning_rate": 0.0002, "epoch": 0.10071338648762064, "step": 120}, {"loss": 1.8618, "grad_norm": 0.4134417176246643, "learning_rate": 0.0002, "epoch": 0.10910616869492237, "step": 130}, {"loss": 1.8528, "grad_norm": 0.42598405480384827, "learning_rate": 0.0002, "epoch": 0.11749895090222409, "step": 140}, {"loss": 1.8056, "grad_norm": 0.39050817489624023, "learning_rate": 0.0002, "epoch": 0.1258917331095258, "step": 150}, {"loss": 1.8912, "grad_norm": 0.3783605098724365, "learning_rate": 0.0002, "epoch": 0.13428451531682753, "step": 160}, {"loss": 1.9022, "grad_norm": 0.4229804575443268, "learning_rate": 0.0002, "epoch": 0.14267729752412925, "step": 170}, {"loss": 1.8183, "grad_norm": 0.3557824194431305, "learning_rate": 0.0002, "epoch": 0.15107007973143097, "step": 180}, {"loss": 1.8105, "grad_norm": 0.37380388379096985, "learning_rate": 0.0002, "epoch": 0.1594628619387327, "step": 190}, {"loss": 1.907, "grad_norm": 0.3803510367870331, "learning_rate": 0.0002, "epoch": 0.1678556441460344, "step": 200}, {"loss": 1.7942, "grad_norm": 0.5078789591789246, "learning_rate": 0.0002, "epoch": 0.17624842635333612, "step": 210}, {"loss": 1.7683, "grad_norm": 1.8922057151794434, "learning_rate": 0.0002, "epoch": 0.18464120856063784, "step": 220}, {"loss": 1.8617, "grad_norm": 0.36936357617378235, "learning_rate": 0.0002, "epoch": 0.19303399076793956, "step": 230}, {"loss": 1.7896, "grad_norm": 0.41423121094703674, "learning_rate": 0.0002, "epoch": 0.20142677297524128, "step": 240}, {"loss": 1.8249, "grad_norm": 0.3869935870170593, "learning_rate": 0.0002, "epoch": 0.209819555182543, "step": 250}, {"loss": 1.7615, "grad_norm": 0.35073965787887573, "learning_rate": 0.0002, "epoch": 0.21821233738984475, "step": 260}, {"loss": 1.8142, "grad_norm": 0.3748358190059662, "learning_rate": 0.0002, "epoch": 0.22660511959714646, "step": 270}, {"loss": 1.8534, "grad_norm": 0.36887043714523315, "learning_rate": 0.0002, "epoch": 0.23499790180444818, "step": 280}, {"loss": 1.8645, "grad_norm": 0.36038365960121155, "learning_rate": 0.0002, "epoch": 0.2433906840117499, "step": 290}, {"loss": 1.7983, "grad_norm": 0.36350926756858826, "learning_rate": 0.0002, "epoch": 0.2517834662190516, "step": 300}, {"loss": 1.8339, "grad_norm": 0.351936936378479, "learning_rate": 0.0002, "epoch": 0.26017624842635334, "step": 310}, {"loss": 1.7953, "grad_norm": 0.35942426323890686, "learning_rate": 0.0002, "epoch": 0.26856903063365506, "step": 320}, {"loss": 1.8205, "grad_norm": 0.39852434396743774, "learning_rate": 0.0002, "epoch": 0.2769618128409568, "step": 330}, {"loss": 1.8598, "grad_norm": 0.3282669186592102, "learning_rate": 0.0002, "epoch": 0.2853545950482585, "step": 340}, {"loss": 1.8164, "grad_norm": 0.3388650417327881, "learning_rate": 0.0002, "epoch": 0.2937473772555602, "step": 350}, {"loss": 1.784, "grad_norm": 0.31616076827049255, "learning_rate": 0.0002, "epoch": 0.30214015946286193, "step": 360}, {"loss": 1.8365, "grad_norm": 0.34184730052948, "learning_rate": 0.0002, "epoch": 0.31053294167016365, "step": 370}, {"loss": 1.8051, "grad_norm": 0.3599095344543457, "learning_rate": 0.0002, "epoch": 0.3189257238774654, "step": 380}, {"loss": 1.8274, "grad_norm": 0.3970130681991577, "learning_rate": 0.0002, "epoch": 0.3273185060847671, "step": 390}, {"loss": 1.7976, "grad_norm": 0.40854907035827637, "learning_rate": 0.0002, "epoch": 0.3357112882920688, "step": 400}, {"loss": 1.8403, "grad_norm": 0.33014851808547974, "learning_rate": 0.0002, "epoch": 0.34410407049937053, "step": 410}, {"loss": 1.825, "grad_norm": 0.3269062042236328, "learning_rate": 0.0002, "epoch": 0.35249685270667225, "step": 420}, {"loss": 1.7968, "grad_norm": 0.35455429553985596, "learning_rate": 0.0002, "epoch": 0.36088963491397397, "step": 430}, {"loss": 1.8299, "grad_norm": 0.34339913725852966, "learning_rate": 0.0002, "epoch": 0.3692824171212757, "step": 440}, {"loss": 1.8525, "grad_norm": 0.34326961636543274, "learning_rate": 0.0002, "epoch": 0.3776751993285774, "step": 450}, {"loss": 1.7931, "grad_norm": 0.33944424986839294, "learning_rate": 0.0002, "epoch": 0.3860679815358791, "step": 460}, {"loss": 1.8445, "grad_norm": 0.3673107326030731, "learning_rate": 0.0002, "epoch": 0.39446076374318084, "step": 470}, {"loss": 1.7105, "grad_norm": 0.40028971433639526, "learning_rate": 0.0002, "epoch": 0.40285354595048256, "step": 480}, {"loss": 1.7771, "grad_norm": 0.4117187261581421, "learning_rate": 0.0002, "epoch": 0.4112463281577843, "step": 490}, {"loss": 1.768, "grad_norm": 0.31541067361831665, "learning_rate": 0.0002, "epoch": 0.419639110365086, "step": 500}, {"loss": 1.7757, "grad_norm": 0.32634997367858887, "learning_rate": 0.0002, "epoch": 0.4280318925723878, "step": 510}, {"loss": 1.793, "grad_norm": 0.3255768120288849, "learning_rate": 0.0002, "epoch": 0.4364246747796895, "step": 520}, {"loss": 1.7375, "grad_norm": 0.34764620661735535, "learning_rate": 0.0002, "epoch": 0.4448174569869912, "step": 530}, {"loss": 1.8421, "grad_norm": 0.36379843950271606, "learning_rate": 0.0002, "epoch": 0.45321023919429293, "step": 540}, {"loss": 1.8103, "grad_norm": 0.37775811553001404, "learning_rate": 0.0002, "epoch": 0.46160302140159465, "step": 550}, {"loss": 1.7982, "grad_norm": 0.3421199917793274, "learning_rate": 0.0002, "epoch": 0.46999580360889637, "step": 560}, {"loss": 1.7753, "grad_norm": 0.3447427749633789, "learning_rate": 0.0002, "epoch": 0.4783885858161981, "step": 570}, {"loss": 1.765, "grad_norm": 0.38283416628837585, "learning_rate": 0.0002, "epoch": 0.4867813680234998, "step": 580}, {"loss": 1.7945, "grad_norm": 0.34281104803085327, "learning_rate": 0.0002, "epoch": 0.4951741502308015, "step": 590}, {"loss": 1.6907, "grad_norm": 0.35317757725715637, "learning_rate": 0.0002, "epoch": 0.5035669324381032, "step": 600}, {"loss": 1.829, "grad_norm": 0.34344494342803955, "learning_rate": 0.0002, "epoch": 0.5119597146454049, "step": 610}, {"loss": 1.84, "grad_norm": 0.3168846666812897, "learning_rate": 0.0002, "epoch": 0.5203524968527067, "step": 620}, {"loss": 1.8811, "grad_norm": 0.570289671421051, "learning_rate": 0.0002, "epoch": 0.5287452790600083, "step": 630}, {"loss": 1.707, "grad_norm": 0.32985877990722656, "learning_rate": 0.0002, "epoch": 0.5371380612673101, "step": 640}, {"loss": 1.8455, "grad_norm": 0.418250173330307, "learning_rate": 0.0002, "epoch": 0.5455308434746118, "step": 650}, {"loss": 1.7127, "grad_norm": 0.34269577264785767, "learning_rate": 0.0002, "epoch": 0.5539236256819136, "step": 660}, {"loss": 1.7964, "grad_norm": 0.6531919240951538, "learning_rate": 0.0002, "epoch": 0.5623164078892152, "step": 670}, {"loss": 1.7499, "grad_norm": 0.3711959719657898, "learning_rate": 0.0002, "epoch": 0.570709190096517, "step": 680}, {"loss": 1.802, "grad_norm": 0.3916425108909607, "learning_rate": 0.0002, "epoch": 0.5791019723038188, "step": 690}, {"loss": 1.8752, "grad_norm": 0.31316208839416504, "learning_rate": 0.0002, "epoch": 0.5874947545111204, "step": 700}, {"loss": 1.8222, "grad_norm": 0.35153743624687195, "learning_rate": 0.0002, "epoch": 0.5958875367184222, "step": 710}, {"loss": 1.7817, "grad_norm": 0.34590575098991394, "learning_rate": 0.0002, "epoch": 0.6042803189257239, "step": 720}, {"loss": 1.8062, "grad_norm": 0.2984001040458679, "learning_rate": 0.0002, "epoch": 0.6126731011330256, "step": 730}, {"loss": 1.8118, "grad_norm": 0.3588712513446808, "learning_rate": 0.0002, "epoch": 0.6210658833403273, "step": 740}, {"loss": 1.7652, "grad_norm": 0.3288203179836273, "learning_rate": 0.0002, "epoch": 0.6294586655476291, "step": 750}, {"loss": 1.799, "grad_norm": 0.3102910816669464, "learning_rate": 0.0002, "epoch": 0.6378514477549307, "step": 760}, {"loss": 1.8746, "grad_norm": 0.42002803087234497, "learning_rate": 0.0002, "epoch": 0.6462442299622325, "step": 770}, {"loss": 1.8726, "grad_norm": 0.35616543889045715, "learning_rate": 0.0002, "epoch": 0.6546370121695342, "step": 780}, {"loss": 1.8118, "grad_norm": 0.37670427560806274, "learning_rate": 0.0002, "epoch": 0.663029794376836, "step": 790}, {"loss": 1.7676, "grad_norm": 0.3410654664039612, "learning_rate": 0.0002, "epoch": 0.6714225765841376, "step": 800}, {"loss": 1.7782, "grad_norm": 0.2916128635406494, "learning_rate": 0.0002, "epoch": 0.6798153587914394, "step": 810}, {"loss": 1.8057, "grad_norm": 0.3147228956222534, "learning_rate": 0.0002, "epoch": 0.6882081409987411, "step": 820}, {"loss": 1.7826, "grad_norm": 0.3593887984752655, "learning_rate": 0.0002, "epoch": 0.6966009232060428, "step": 830}, {"loss": 1.754, "grad_norm": 0.29242461919784546, "learning_rate": 0.0002, "epoch": 0.7049937054133445, "step": 840}, {"loss": 1.8083, "grad_norm": 0.32993558049201965, "learning_rate": 0.0002, "epoch": 0.7133864876206463, "step": 850}, {"loss": 1.6948, "grad_norm": 0.3939134478569031, "learning_rate": 0.0002, "epoch": 0.7217792698279479, "step": 860}, {"loss": 1.8261, "grad_norm": 0.3476874828338623, "learning_rate": 0.0002, "epoch": 0.7301720520352497, "step": 870}, {"loss": 1.8127, "grad_norm": 0.324367880821228, "learning_rate": 0.0002, "epoch": 0.7385648342425514, "step": 880}, {"loss": 1.7533, "grad_norm": 0.29460495710372925, "learning_rate": 0.0002, "epoch": 0.7469576164498531, "step": 890}, {"loss": 1.7544, "grad_norm": 0.37918367981910706, "learning_rate": 0.0002, "epoch": 0.7553503986571548, "step": 900}, {"loss": 1.7579, "grad_norm": 0.3517799973487854, "learning_rate": 0.0002, "epoch": 0.7637431808644566, "step": 910}, {"loss": 1.7895, "grad_norm": 0.3069603443145752, "learning_rate": 0.0002, "epoch": 0.7721359630717582, "step": 920}, {"loss": 1.7589, "grad_norm": 0.3776717483997345, "learning_rate": 0.0002, "epoch": 0.78052874527906, "step": 930}, {"loss": 1.8663, "grad_norm": 0.4474868178367615, "learning_rate": 0.0002, "epoch": 0.7889215274863617, "step": 940}, {"loss": 1.7976, "grad_norm": 0.3259398639202118, "learning_rate": 0.0002, "epoch": 0.7973143096936635, "step": 950}, {"loss": 1.7827, "grad_norm": 0.3109343647956848, "learning_rate": 0.0002, "epoch": 0.8057070919009651, "step": 960}, {"loss": 1.8035, "grad_norm": 0.3707215189933777, "learning_rate": 0.0002, "epoch": 0.8140998741082669, "step": 970}, {"loss": 1.851, "grad_norm": 0.3671801686286926, "learning_rate": 0.0002, "epoch": 0.8224926563155686, "step": 980}, {"loss": 1.7351, "grad_norm": 0.3278632164001465, "learning_rate": 0.0002, "epoch": 0.8308854385228703, "step": 990}, {"loss": 1.7679, "grad_norm": 0.32587629556655884, "learning_rate": 0.0002, "epoch": 0.839278220730172, "step": 1000}, {"loss": 1.7563, "grad_norm": 0.3705422878265381, "learning_rate": 0.0002, "epoch": 0.8476710029374738, "step": 1010}, {"loss": 1.7723, "grad_norm": 0.43461498618125916, "learning_rate": 0.0002, "epoch": 0.8560637851447755, "step": 1020}, {"loss": 1.7528, "grad_norm": 0.30326616764068604, "learning_rate": 0.0002, "epoch": 0.8644565673520772, "step": 1030}, {"loss": 1.7688, "grad_norm": 0.3383970260620117, "learning_rate": 0.0002, "epoch": 0.872849349559379, "step": 1040}, {"loss": 1.7701, "grad_norm": 0.3041667640209198, "learning_rate": 0.0002, "epoch": 0.8812421317666806, "step": 1050}, {"loss": 1.8515, "grad_norm": 0.4173165261745453, "learning_rate": 0.0002, "epoch": 0.8896349139739824, "step": 1060}, {"loss": 1.8217, "grad_norm": 0.394760400056839, "learning_rate": 0.0002, "epoch": 0.8980276961812841, "step": 1070}, {"loss": 1.7425, "grad_norm": 0.32503336668014526, "learning_rate": 0.0002, "epoch": 0.9064204783885859, "step": 1080}, {"loss": 1.7712, "grad_norm": 0.339996337890625, "learning_rate": 0.0002, "epoch": 0.9148132605958875, "step": 1090}, {"loss": 1.7893, "grad_norm": 0.3512224555015564, "learning_rate": 0.0002, "epoch": 0.9232060428031893, "step": 1100}, {"loss": 1.8027, "grad_norm": 0.458159863948822, "learning_rate": 0.0002, "epoch": 0.931598825010491, "step": 1110}, {"loss": 1.7974, "grad_norm": 0.3467862904071808, "learning_rate": 0.0002, "epoch": 0.9399916072177927, "step": 1120}, {"loss": 1.836, "grad_norm": 0.3274364173412323, "learning_rate": 0.0002, "epoch": 0.9483843894250944, "step": 1130}, {"loss": 1.7669, "grad_norm": 0.3269580006599426, "learning_rate": 0.0002, "epoch": 0.9567771716323962, "step": 1140}, {"loss": 1.8383, "grad_norm": 0.31564876437187195, "learning_rate": 0.0002, "epoch": 0.9651699538396978, "step": 1150}, {"loss": 1.782, "grad_norm": 0.32907289266586304, "learning_rate": 0.0002, "epoch": 0.9735627360469996, "step": 1160}, {"loss": 1.717, "grad_norm": 0.3564138412475586, "learning_rate": 0.0002, "epoch": 0.9819555182543013, "step": 1170}, {"loss": 1.7615, "grad_norm": 0.32875651121139526, "learning_rate": 0.0002, "epoch": 0.990348300461603, "step": 1180}, {"loss": 1.7232, "grad_norm": 0.3225541114807129, "learning_rate": 0.0002, "epoch": 0.9987410826689047, "step": 1190}, {"eval_loss": 1.8086129426956177, "eval_runtime": 38.0431, "eval_samples_per_second": 13.537, "eval_steps_per_second": 1.709, "epoch": 0.9995803608896349, "step": 1191}, {"loss": 1.6856, "grad_norm": 0.3235187232494354, "learning_rate": 0.0002, "epoch": 1.0071338648762065, "step": 1200}, {"loss": 1.7121, "grad_norm": 0.34884774684906006, "learning_rate": 0.0002, "epoch": 1.0155266470835083, "step": 1210}, {"loss": 1.6779, "grad_norm": 0.3215438425540924, "learning_rate": 0.0002, "epoch": 1.0239194292908098, "step": 1220}, {"loss": 1.6562, "grad_norm": 0.312084823846817, "learning_rate": 0.0002, "epoch": 1.0323122114981116, "step": 1230}, {"loss": 1.7366, "grad_norm": 0.33597758412361145, "learning_rate": 0.0002, "epoch": 1.0407049937054134, "step": 1240}, {"loss": 1.7245, "grad_norm": 0.3421499729156494, "learning_rate": 0.0002, "epoch": 1.0490977759127151, "step": 1250}, {"loss": 1.7331, "grad_norm": 0.3458889126777649, "learning_rate": 0.0002, "epoch": 1.0574905581200167, "step": 1260}, {"loss": 1.6929, "grad_norm": 0.3956579864025116, "learning_rate": 0.0002, "epoch": 1.0658833403273185, "step": 1270}, {"loss": 1.6625, "grad_norm": 0.3217819035053253, "learning_rate": 0.0002, "epoch": 1.0742761225346202, "step": 1280}, {"loss": 1.7488, "grad_norm": 0.31379663944244385, "learning_rate": 0.0002, "epoch": 1.082668904741922, "step": 1290}, {"loss": 1.6331, "grad_norm": 0.37231558561325073, "learning_rate": 0.0002, "epoch": 1.0910616869492236, "step": 1300}, {"loss": 1.6614, "grad_norm": 0.35857918858528137, "learning_rate": 0.0002, "epoch": 1.0994544691565253, "step": 1310}, {"loss": 1.7344, "grad_norm": 0.36637991666793823, "learning_rate": 0.0002, "epoch": 1.1078472513638271, "step": 1320}, {"loss": 1.7245, "grad_norm": 0.3436494469642639, "learning_rate": 0.0002, "epoch": 1.1162400335711289, "step": 1330}, {"loss": 1.6867, "grad_norm": 0.404908150434494, "learning_rate": 0.0002, "epoch": 1.1246328157784307, "step": 1340}, {"loss": 1.7042, "grad_norm": 0.34587544202804565, "learning_rate": 0.0002, "epoch": 1.1330255979857322, "step": 1350}, {"loss": 1.6365, "grad_norm": 0.35142362117767334, "learning_rate": 0.0002, "epoch": 1.141418380193034, "step": 1360}, {"loss": 1.6781, "grad_norm": 0.3511804938316345, "learning_rate": 0.0002, "epoch": 1.1498111624003358, "step": 1370}, {"loss": 1.6824, "grad_norm": 0.3549560308456421, "learning_rate": 0.0002, "epoch": 1.1582039446076373, "step": 1380}, {"loss": 1.7276, "grad_norm": 0.35797521471977234, "learning_rate": 0.0002, "epoch": 1.166596726814939, "step": 1390}, {"loss": 1.7476, "grad_norm": 0.37255269289016724, "learning_rate": 0.0002, "epoch": 1.1749895090222409, "step": 1400}, {"loss": 1.7274, "grad_norm": 0.3680652379989624, "learning_rate": 0.0002, "epoch": 1.1833822912295426, "step": 1410}, {"loss": 1.6751, "grad_norm": 0.400831013917923, "learning_rate": 0.0002, "epoch": 1.1917750734368444, "step": 1420}, {"loss": 1.7961, "grad_norm": 0.39571020007133484, "learning_rate": 0.0002, "epoch": 1.200167855644146, "step": 1430}, {"loss": 1.792, "grad_norm": 0.3843863010406494, "learning_rate": 0.0002, "epoch": 1.2085606378514477, "step": 1440}, {"loss": 1.7072, "grad_norm": 0.3901960551738739, "learning_rate": 0.0002, "epoch": 1.2169534200587495, "step": 1450}, {"loss": 1.6425, "grad_norm": 0.36490726470947266, "learning_rate": 0.0002, "epoch": 1.2253462022660513, "step": 1460}, {"loss": 1.6995, "grad_norm": 0.3739864230155945, "learning_rate": 0.0002, "epoch": 1.2337389844733528, "step": 1470}, {"loss": 1.6795, "grad_norm": 0.39061254262924194, "learning_rate": 0.0002, "epoch": 1.2421317666806546, "step": 1480}, {"loss": 1.6838, "grad_norm": 0.37198659777641296, "learning_rate": 0.0002, "epoch": 1.2505245488879564, "step": 1490}, {"loss": 1.725, "grad_norm": 0.3420586884021759, "learning_rate": 0.0002, "epoch": 1.2589173310952582, "step": 1500}, {"loss": 1.719, "grad_norm": 0.4094347655773163, "learning_rate": 0.0002, "epoch": 1.2673101133025597, "step": 1510}, {"loss": 1.7563, "grad_norm": 0.38997703790664673, "learning_rate": 0.0002, "epoch": 1.2757028955098615, "step": 1520}, {"loss": 1.6651, "grad_norm": 0.35702022910118103, "learning_rate": 0.0002, "epoch": 1.2840956777171633, "step": 1530}, {"loss": 1.6689, "grad_norm": 0.3892163336277008, "learning_rate": 0.0002, "epoch": 1.292488459924465, "step": 1540}, {"loss": 1.7209, "grad_norm": 0.33174318075180054, "learning_rate": 0.0002, "epoch": 1.3008812421317666, "step": 1550}, {"loss": 1.7581, "grad_norm": 0.40701809525489807, "learning_rate": 0.0002, "epoch": 1.3092740243390684, "step": 1560}, {"loss": 1.7229, "grad_norm": 0.36324232816696167, "learning_rate": 0.0002, "epoch": 1.3176668065463701, "step": 1570}, {"loss": 1.6708, "grad_norm": 0.3748789429664612, "learning_rate": 0.0002, "epoch": 1.326059588753672, "step": 1580}, {"loss": 1.67, "grad_norm": 0.40873438119888306, "learning_rate": 0.0002, "epoch": 1.3344523709609737, "step": 1590}, {"loss": 1.7909, "grad_norm": 0.52373206615448, "learning_rate": 0.0002, "epoch": 1.3428451531682752, "step": 1600}, {"loss": 1.7593, "grad_norm": 0.40408164262771606, "learning_rate": 0.0002, "epoch": 1.351237935375577, "step": 1610}, {"loss": 1.7959, "grad_norm": 0.3818126320838928, "learning_rate": 0.0002, "epoch": 1.3596307175828788, "step": 1620}, {"loss": 1.6328, "grad_norm": 0.3457068204879761, "learning_rate": 0.0002, "epoch": 1.3680234997901803, "step": 1630}, {"loss": 1.7017, "grad_norm": 0.33777865767478943, "learning_rate": 0.0002, "epoch": 1.3764162819974821, "step": 1640}, {"loss": 1.7335, "grad_norm": 0.36344218254089355, "learning_rate": 0.0002, "epoch": 1.384809064204784, "step": 1650}, {"loss": 1.7656, "grad_norm": 0.3880128562450409, "learning_rate": 0.0002, "epoch": 1.3932018464120857, "step": 1660}, {"loss": 1.7377, "grad_norm": 0.3906225562095642, "learning_rate": 0.0002, "epoch": 1.4015946286193874, "step": 1670}, {"loss": 1.7041, "grad_norm": 0.35857489705085754, "learning_rate": 0.0002, "epoch": 1.409987410826689, "step": 1680}, {"loss": 1.7175, "grad_norm": 0.3627418279647827, "learning_rate": 0.0002, "epoch": 1.4183801930339908, "step": 1690}, {"loss": 1.6948, "grad_norm": 0.41963326930999756, "learning_rate": 0.0002, "epoch": 1.4267729752412925, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.36280378699302673, "learning_rate": 0.0002, "epoch": 1.435165757448594, "step": 1710}, {"loss": 1.7775, "grad_norm": 0.3868233561515808, "learning_rate": 0.0002, "epoch": 1.4435585396558959, "step": 1720}, {"loss": 1.6963, "grad_norm": 0.3635849356651306, "learning_rate": 0.0002, "epoch": 1.4519513218631976, "step": 1730}, {"loss": 1.7381, "grad_norm": 0.4885194003582001, "learning_rate": 0.0002, "epoch": 1.4603441040704994, "step": 1740}, {"loss": 1.6661, "grad_norm": 0.35194680094718933, "learning_rate": 0.0002, "epoch": 1.4687368862778012, "step": 1750}, {"loss": 1.7841, "grad_norm": 0.34906691312789917, "learning_rate": 0.0002, "epoch": 1.4771296684851027, "step": 1760}, {"loss": 1.7196, "grad_norm": 0.3994184732437134, "learning_rate": 0.0002, "epoch": 1.4855224506924045, "step": 1770}, {"loss": 1.7157, "grad_norm": 0.3599298298358917, "learning_rate": 0.0002, "epoch": 1.4939152328997063, "step": 1780}, {"loss": 1.6966, "grad_norm": 0.3794984221458435, "learning_rate": 0.0002, "epoch": 1.5023080151070078, "step": 1790}, {"loss": 1.7187, "grad_norm": 0.36289724707603455, "learning_rate": 0.0002, "epoch": 1.5107007973143096, "step": 1800}, {"loss": 1.78, "grad_norm": 0.38057321310043335, "learning_rate": 0.0002, "epoch": 1.5190935795216114, "step": 1810}, {"loss": 1.7006, "grad_norm": 0.3771969676017761, "learning_rate": 0.0002, "epoch": 1.5274863617289132, "step": 1820}, {"loss": 1.765, "grad_norm": 0.34788841009140015, "learning_rate": 0.0002, "epoch": 1.535879143936215, "step": 1830}, {"loss": 1.7148, "grad_norm": 0.41352227330207825, "learning_rate": 0.0002, "epoch": 1.5442719261435167, "step": 1840}, {"loss": 1.6654, "grad_norm": 0.35711410641670227, "learning_rate": 0.0002, "epoch": 1.5526647083508183, "step": 1850}, {"loss": 1.6998, "grad_norm": 0.40607622265815735, "learning_rate": 0.0002, "epoch": 1.56105749055812, "step": 1860}, {"loss": 1.713, "grad_norm": 0.3428550660610199, "learning_rate": 0.0002, "epoch": 1.5694502727654216, "step": 1870}, {"loss": 1.7909, "grad_norm": 0.3695414066314697, "learning_rate": 0.0002, "epoch": 1.5778430549727234, "step": 1880}, {"loss": 1.6629, "grad_norm": 0.3798272907733917, "learning_rate": 0.0002, "epoch": 1.5862358371800251, "step": 1890}, {"loss": 1.7412, "grad_norm": 0.3415829837322235, "learning_rate": 0.0002, "epoch": 1.594628619387327, "step": 1900}, {"loss": 1.8233, "grad_norm": 0.3575693666934967, "learning_rate": 0.0002, "epoch": 1.6030214015946287, "step": 1910}, {"loss": 1.6947, "grad_norm": 0.3180370628833771, "learning_rate": 0.0002, "epoch": 1.6114141838019305, "step": 1920}, {"loss": 1.7506, "grad_norm": 0.5018689036369324, "learning_rate": 0.0002, "epoch": 1.619806966009232, "step": 1930}, {"loss": 1.7368, "grad_norm": 0.35676372051239014, "learning_rate": 0.0002, "epoch": 1.6281997482165338, "step": 1940}, {"loss": 1.7159, "grad_norm": 0.3740452229976654, "learning_rate": 0.0002, "epoch": 1.6365925304238353, "step": 1950}, {"loss": 1.6474, "grad_norm": 0.36584731936454773, "learning_rate": 0.0002, "epoch": 1.6449853126311371, "step": 1960}, {"loss": 1.7306, "grad_norm": 0.38556376099586487, "learning_rate": 0.0002, "epoch": 1.653378094838439, "step": 1970}, {"loss": 1.7694, "grad_norm": 0.4114968776702881, "learning_rate": 0.0002, "epoch": 1.6617708770457407, "step": 1980}, {"loss": 1.6407, "grad_norm": 0.3665498197078705, "learning_rate": 0.0002, "epoch": 1.6701636592530424, "step": 1990}, {"loss": 1.7167, "grad_norm": 0.36579379439353943, "learning_rate": 0.0002, "epoch": 1.6785564414603442, "step": 2000}, {"loss": 1.7637, "grad_norm": 0.3813064694404602, "learning_rate": 0.0002, "epoch": 1.6869492236676458, "step": 2010}, {"loss": 1.7566, "grad_norm": 0.33390694856643677, "learning_rate": 0.0002, "epoch": 1.6953420058749475, "step": 2020}, {"loss": 1.6576, "grad_norm": 0.3668614327907562, "learning_rate": 0.0002, "epoch": 1.7037347880822493, "step": 2030}, {"loss": 1.7162, "grad_norm": 0.352028489112854, "learning_rate": 0.0002, "epoch": 1.7121275702895509, "step": 2040}, {"loss": 1.727, "grad_norm": 0.33639830350875854, "learning_rate": 0.0002, "epoch": 1.7205203524968526, "step": 2050}, {"loss": 1.7868, "grad_norm": 0.39217695593833923, "learning_rate": 0.0002, "epoch": 1.7289131347041544, "step": 2060}, {"loss": 1.7608, "grad_norm": 0.42593324184417725, "learning_rate": 0.0002, "epoch": 1.7373059169114562, "step": 2070}, {"loss": 1.722, "grad_norm": 0.362215518951416, "learning_rate": 0.0002, "epoch": 1.745698699118758, "step": 2080}, {"loss": 1.7712, "grad_norm": 0.4087955057621002, "learning_rate": 0.0002, "epoch": 1.7540914813260597, "step": 2090}, {"loss": 1.6414, "grad_norm": 0.35127750039100647, "learning_rate": 0.0002, "epoch": 1.7624842635333613, "step": 2100}, {"loss": 1.7405, "grad_norm": 0.33677494525909424, "learning_rate": 0.0002, "epoch": 1.770877045740663, "step": 2110}, {"loss": 1.7478, "grad_norm": 0.39616644382476807, "learning_rate": 0.0002, "epoch": 1.7792698279479646, "step": 2120}, {"loss": 1.8068, "grad_norm": 0.4705100953578949, "learning_rate": 0.0002, "epoch": 1.7876626101552664, "step": 2130}, {"loss": 1.75, "grad_norm": 0.3893914818763733, "learning_rate": 0.0002, "epoch": 1.7960553923625682, "step": 2140}, {"loss": 1.6711, "grad_norm": 0.3344813585281372, "learning_rate": 0.0002, "epoch": 1.80444817456987, "step": 2150}, {"loss": 1.8329, "grad_norm": 0.36502110958099365, "learning_rate": 0.0002, "epoch": 1.8128409567771717, "step": 2160}, {"loss": 1.753, "grad_norm": 0.3422985374927521, "learning_rate": 0.0002, "epoch": 1.8212337389844735, "step": 2170}, {"loss": 1.6874, "grad_norm": 0.44039851427078247, "learning_rate": 0.0002, "epoch": 1.829626521191775, "step": 2180}, {"loss": 1.7706, "grad_norm": 0.40052926540374756, "learning_rate": 0.0002, "epoch": 1.8380193033990768, "step": 2190}, {"loss": 1.7551, "grad_norm": 0.3614487648010254, "learning_rate": 0.0002, "epoch": 1.8464120856063784, "step": 2200}, {"loss": 1.6879, "grad_norm": 0.3800305426120758, "learning_rate": 0.0002, "epoch": 1.8548048678136801, "step": 2210}, {"loss": 1.7731, "grad_norm": 0.3942040205001831, "learning_rate": 0.0002, "epoch": 1.863197650020982, "step": 2220}, {"loss": 1.7187, "grad_norm": 0.36896875500679016, "learning_rate": 0.0002, "epoch": 1.8715904322282837, "step": 2230}, {"loss": 1.7371, "grad_norm": 0.3666089177131653, "learning_rate": 0.0002, "epoch": 1.8799832144355855, "step": 2240}, {"loss": 1.7336, "grad_norm": 0.3759142756462097, "learning_rate": 0.0002, "epoch": 1.8883759966428872, "step": 2250}, {"loss": 1.7243, "grad_norm": 0.3711695671081543, "learning_rate": 0.0002, "epoch": 1.8967687788501888, "step": 2260}, {"loss": 1.7052, "grad_norm": 0.37000006437301636, "learning_rate": 0.0002, "epoch": 1.9051615610574906, "step": 2270}, {"loss": 1.7104, "grad_norm": 0.37376025319099426, "learning_rate": 0.0002, "epoch": 1.9135543432647921, "step": 2280}, {"loss": 1.6641, "grad_norm": 0.3794068694114685, "learning_rate": 0.0002, "epoch": 1.921947125472094, "step": 2290}, {"loss": 1.7693, "grad_norm": 0.42530709505081177, "learning_rate": 0.0002, "epoch": 1.9303399076793957, "step": 2300}, {"loss": 1.7871, "grad_norm": 0.3381672203540802, "learning_rate": 0.0002, "epoch": 1.9387326898866974, "step": 2310}, {"loss": 1.7502, "grad_norm": 0.3553236722946167, "learning_rate": 0.0002, "epoch": 1.9471254720939992, "step": 2320}, {"loss": 1.715, "grad_norm": 0.38204774260520935, "learning_rate": 0.0002, "epoch": 1.955518254301301, "step": 2330}, {"loss": 1.7088, "grad_norm": 0.4318946301937103, "learning_rate": 0.0002, "epoch": 1.9639110365086025, "step": 2340}, {"loss": 1.7709, "grad_norm": 0.3563119173049927, "learning_rate": 0.0002, "epoch": 1.9723038187159043, "step": 2350}, {"loss": 1.7083, "grad_norm": 0.362532377243042, "learning_rate": 0.0002, "epoch": 1.980696600923206, "step": 2360}, {"loss": 1.6992, "grad_norm": 0.40200483798980713, "learning_rate": 0.0002, "epoch": 1.9890893831305076, "step": 2370}, {"loss": 1.7622, "grad_norm": 0.37397003173828125, "learning_rate": 0.0002, "epoch": 1.9974821653378094, "step": 2380}, {"eval_loss": 1.807437539100647, "eval_runtime": 38.0038, "eval_samples_per_second": 13.551, "eval_steps_per_second": 1.71, "epoch": 2.0, "step": 2383}, {"loss": 1.579, "grad_norm": 0.3563518226146698, "learning_rate": 0.0002, "epoch": 2.005874947545111, "step": 2390}, {"loss": 1.5467, "grad_norm": 0.3913732171058655, "learning_rate": 0.0002, "epoch": 2.014267729752413, "step": 2400}, {"loss": 1.6202, "grad_norm": 0.3511047661304474, "learning_rate": 0.0002, "epoch": 2.0226605119597147, "step": 2410}, {"loss": 1.599, "grad_norm": 0.3917897641658783, "learning_rate": 0.0002, "epoch": 2.0310532941670165, "step": 2420}, {"loss": 1.663, "grad_norm": 0.36766913533210754, "learning_rate": 0.0002, "epoch": 2.0394460763743183, "step": 2430}, {"loss": 1.5608, "grad_norm": 0.434097021818161, "learning_rate": 0.0002, "epoch": 2.0478388585816196, "step": 2440}, {"loss": 1.6199, "grad_norm": 0.4986756145954132, "learning_rate": 0.0002, "epoch": 2.0562316407889214, "step": 2450}, {"loss": 1.6224, "grad_norm": 0.4377020001411438, "learning_rate": 0.0002, "epoch": 2.064624422996223, "step": 2460}, {"loss": 1.6047, "grad_norm": 0.4412095546722412, "learning_rate": 0.0002, "epoch": 2.073017205203525, "step": 2470}, {"loss": 1.6766, "grad_norm": 0.4463737905025482, "learning_rate": 0.0002, "epoch": 2.0814099874108267, "step": 2480}, {"loss": 1.6666, "grad_norm": 0.4118853211402893, "learning_rate": 0.0002, "epoch": 2.0898027696181285, "step": 2490}, {"loss": 1.6384, "grad_norm": 0.48814308643341064, "learning_rate": 0.0002, "epoch": 2.0981955518254303, "step": 2500}, {"loss": 1.6292, "grad_norm": 0.4263038635253906, "learning_rate": 0.0002, "epoch": 2.106588334032732, "step": 2510}, {"loss": 1.5907, "grad_norm": 0.41060999035835266, "learning_rate": 0.0002, "epoch": 2.1149811162400334, "step": 2520}, {"loss": 1.685, "grad_norm": 0.4699285626411438, "learning_rate": 0.0002, "epoch": 2.123373898447335, "step": 2530}, {"loss": 1.6076, "grad_norm": 0.4321298897266388, "learning_rate": 0.0002, "epoch": 2.131766680654637, "step": 2540}, {"loss": 1.5715, "grad_norm": 0.41544368863105774, "learning_rate": 0.0002, "epoch": 2.1401594628619387, "step": 2550}, {"loss": 1.6717, "grad_norm": 0.4529191851615906, "learning_rate": 0.0002, "epoch": 2.1485522450692405, "step": 2560}, {"loss": 1.7014, "grad_norm": 0.4370215833187103, "learning_rate": 0.0002, "epoch": 2.1569450272765422, "step": 2570}, {"loss": 1.55, "grad_norm": 0.3878629207611084, "learning_rate": 0.0002, "epoch": 2.165337809483844, "step": 2580}, {"loss": 1.6863, "grad_norm": 0.47374191880226135, "learning_rate": 0.0002, "epoch": 2.173730591691146, "step": 2590}, {"loss": 1.6462, "grad_norm": 0.4551556706428528, "learning_rate": 0.0002, "epoch": 2.182123373898447, "step": 2600}, {"loss": 1.6238, "grad_norm": 0.45371633768081665, "learning_rate": 0.0002, "epoch": 2.190516156105749, "step": 2610}, {"loss": 1.6134, "grad_norm": 0.3831859529018402, "learning_rate": 0.0002, "epoch": 2.1989089383130507, "step": 2620}, {"loss": 1.6477, "grad_norm": 0.42436569929122925, "learning_rate": 0.0002, "epoch": 2.2073017205203525, "step": 2630}, {"loss": 1.6512, "grad_norm": 0.4363750219345093, "learning_rate": 0.0002, "epoch": 2.2156945027276542, "step": 2640}, {"loss": 1.6978, "grad_norm": 0.4473390579223633, "learning_rate": 0.0002, "epoch": 2.224087284934956, "step": 2650}, {"loss": 1.6161, "grad_norm": 0.4419533908367157, "learning_rate": 0.0002, "epoch": 2.2324800671422578, "step": 2660}, {"loss": 1.6415, "grad_norm": 0.525901198387146, "learning_rate": 0.0002, "epoch": 2.2408728493495595, "step": 2670}, {"loss": 1.6891, "grad_norm": 0.4345211684703827, "learning_rate": 0.0002, "epoch": 2.2492656315568613, "step": 2680}, {"loss": 1.5951, "grad_norm": 0.5169841051101685, "learning_rate": 0.0002, "epoch": 2.2576584137641627, "step": 2690}, {"loss": 1.6221, "grad_norm": 0.43511003255844116, "learning_rate": 0.0002, "epoch": 2.2660511959714644, "step": 2700}, {"loss": 1.6084, "grad_norm": 0.4781411588191986, "learning_rate": 0.0002, "epoch": 2.274443978178766, "step": 2710}, {"loss": 1.6292, "grad_norm": 0.4282242953777313, "learning_rate": 0.0002, "epoch": 2.282836760386068, "step": 2720}, {"loss": 1.5238, "grad_norm": 0.4499875605106354, "learning_rate": 0.0002, "epoch": 2.2912295425933698, "step": 2730}, {"loss": 1.5844, "grad_norm": 0.4133218824863434, "learning_rate": 0.0002, "epoch": 2.2996223248006715, "step": 2740}, {"loss": 1.6207, "grad_norm": 0.4706156849861145, "learning_rate": 0.0002, "epoch": 2.3080151070079733, "step": 2750}, {"loss": 1.573, "grad_norm": 0.4537484347820282, "learning_rate": 0.0002, "epoch": 2.3164078892152746, "step": 2760}, {"loss": 1.6556, "grad_norm": 0.39736735820770264, "learning_rate": 0.0002, "epoch": 2.3248006714225764, "step": 2770}, {"loss": 1.7032, "grad_norm": 0.4488453269004822, "learning_rate": 0.0002, "epoch": 2.333193453629878, "step": 2780}, {"loss": 1.6169, "grad_norm": 0.44405487179756165, "learning_rate": 0.0002, "epoch": 2.34158623583718, "step": 2790}, {"loss": 1.5207, "grad_norm": 0.4726555049419403, "learning_rate": 0.0002, "epoch": 2.3499790180444817, "step": 2800}, {"loss": 1.5792, "grad_norm": 0.4820375442504883, "learning_rate": 0.0002, "epoch": 2.3583718002517835, "step": 2810}, {"loss": 1.5774, "grad_norm": 0.46176597476005554, "learning_rate": 0.0002, "epoch": 2.3667645824590853, "step": 2820}, {"loss": 1.6256, "grad_norm": 0.4603394567966461, "learning_rate": 0.0002, "epoch": 2.375157364666387, "step": 2830}, {"loss": 1.6598, "grad_norm": 0.4462946355342865, "learning_rate": 0.0002, "epoch": 2.383550146873689, "step": 2840}, {"loss": 1.5939, "grad_norm": 0.5216080546379089, "learning_rate": 0.0002, "epoch": 2.39194292908099, "step": 2850}, {"loss": 1.5981, "grad_norm": 0.44553086161613464, "learning_rate": 0.0002, "epoch": 2.400335711288292, "step": 2860}, {"loss": 1.6556, "grad_norm": 0.4215725362300873, "learning_rate": 0.0002, "epoch": 2.4087284934955937, "step": 2870}, {"loss": 1.6228, "grad_norm": 0.4646450877189636, "learning_rate": 0.0002, "epoch": 2.4171212757028955, "step": 2880}, {"loss": 1.6547, "grad_norm": 0.44749370217323303, "learning_rate": 0.0002, "epoch": 2.4255140579101973, "step": 2890}, {"loss": 1.6356, "grad_norm": 0.4986693859100342, "learning_rate": 0.0002, "epoch": 2.433906840117499, "step": 2900}, {"loss": 1.6294, "grad_norm": 0.4607609808444977, "learning_rate": 0.0002, "epoch": 2.442299622324801, "step": 2910}, {"loss": 1.6721, "grad_norm": 0.4597654938697815, "learning_rate": 0.0002, "epoch": 2.4506924045321026, "step": 2920}, {"loss": 1.7428, "grad_norm": 0.4106820821762085, "learning_rate": 0.0002, "epoch": 2.4590851867394043, "step": 2930}, {"loss": 1.622, "grad_norm": 0.4531514048576355, "learning_rate": 0.0002, "epoch": 2.4674779689467057, "step": 2940}, {"loss": 1.6367, "grad_norm": 0.4546769857406616, "learning_rate": 0.0002, "epoch": 2.4758707511540075, "step": 2950}, {"loss": 1.6306, "grad_norm": 0.47410622239112854, "learning_rate": 0.0002, "epoch": 2.4842635333613092, "step": 2960}, {"loss": 1.6597, "grad_norm": 0.4498177468776703, "learning_rate": 0.0002, "epoch": 2.492656315568611, "step": 2970}, {"loss": 1.6845, "grad_norm": 0.47267791628837585, "learning_rate": 0.0002, "epoch": 2.5010490977759128, "step": 2980}, {"loss": 1.601, "grad_norm": 0.4340207576751709, "learning_rate": 0.0002, "epoch": 2.5094418799832146, "step": 2990}, {"loss": 1.5783, "grad_norm": 0.43454936146736145, "learning_rate": 0.0002, "epoch": 2.5178346621905163, "step": 3000}, {"loss": 1.5773, "grad_norm": 0.43459394574165344, "learning_rate": 0.0002, "epoch": 2.5262274443978177, "step": 3010}, {"loss": 1.6376, "grad_norm": 0.4716770052909851, "learning_rate": 0.0002, "epoch": 2.5346202266051194, "step": 3020}, {"loss": 1.626, "grad_norm": 0.4339194595813751, "learning_rate": 0.0002, "epoch": 2.543013008812421, "step": 3030}, {"loss": 1.6053, "grad_norm": 0.4655593931674957, "learning_rate": 0.0002, "epoch": 2.551405791019723, "step": 3040}, {"loss": 1.5871, "grad_norm": 0.5480475425720215, "learning_rate": 0.0002, "epoch": 2.5597985732270248, "step": 3050}, {"loss": 1.7056, "grad_norm": 0.4783174991607666, "learning_rate": 0.0002, "epoch": 2.5681913554343265, "step": 3060}, {"loss": 1.5691, "grad_norm": 0.45062026381492615, "learning_rate": 0.0002, "epoch": 2.5765841376416283, "step": 3070}, {"loss": 1.7005, "grad_norm": 0.4559392035007477, "learning_rate": 0.0002, "epoch": 2.58497691984893, "step": 3080}, {"loss": 1.6414, "grad_norm": 0.6581618785858154, "learning_rate": 0.0002, "epoch": 2.593369702056232, "step": 3090}, {"loss": 1.6707, "grad_norm": 0.48549333214759827, "learning_rate": 0.0002, "epoch": 2.601762484263533, "step": 3100}, {"loss": 1.6128, "grad_norm": 0.5358436107635498, "learning_rate": 0.0002, "epoch": 2.610155266470835, "step": 3110}, {"loss": 1.6507, "grad_norm": 0.5380043983459473, "learning_rate": 0.0002, "epoch": 2.6185480486781367, "step": 3120}, {"loss": 1.6394, "grad_norm": 0.49887847900390625, "learning_rate": 0.0002, "epoch": 2.6269408308854385, "step": 3130}, {"loss": 1.6464, "grad_norm": 0.46039602160453796, "learning_rate": 0.0002, "epoch": 2.6353336130927403, "step": 3140}, {"loss": 1.6337, "grad_norm": 0.416098952293396, "learning_rate": 0.0002, "epoch": 2.643726395300042, "step": 3150}, {"loss": 1.6295, "grad_norm": 0.465326726436615, "learning_rate": 0.0002, "epoch": 2.652119177507344, "step": 3160}, {"loss": 1.5806, "grad_norm": 0.47029924392700195, "learning_rate": 0.0002, "epoch": 2.660511959714645, "step": 3170}, {"loss": 1.6268, "grad_norm": 0.5063307285308838, "learning_rate": 0.0002, "epoch": 2.6689047419219474, "step": 3180}, {"loss": 1.5718, "grad_norm": 0.42928868532180786, "learning_rate": 0.0002, "epoch": 2.6772975241292487, "step": 3190}, {"loss": 1.6113, "grad_norm": 0.4170134365558624, "learning_rate": 0.0002, "epoch": 2.6856903063365505, "step": 3200}, {"loss": 1.6337, "grad_norm": 0.47810474038124084, "learning_rate": 0.0002, "epoch": 2.6940830885438523, "step": 3210}, {"loss": 1.6808, "grad_norm": 0.44440609216690063, "learning_rate": 0.0002, "epoch": 2.702475870751154, "step": 3220}, {"loss": 1.5611, "grad_norm": 0.482759565114975, "learning_rate": 0.0002, "epoch": 2.710868652958456, "step": 3230}, {"loss": 1.6265, "grad_norm": 0.4325942099094391, "learning_rate": 0.0002, "epoch": 2.7192614351657576, "step": 3240}, {"loss": 1.585, "grad_norm": 0.502498984336853, "learning_rate": 0.0002, "epoch": 2.7276542173730594, "step": 3250}, {"loss": 1.7179, "grad_norm": 0.4725162982940674, "learning_rate": 0.0002, "epoch": 2.7360469995803607, "step": 3260}, {"loss": 1.6591, "grad_norm": 0.46781349182128906, "learning_rate": 0.0002, "epoch": 2.7444397817876625, "step": 3270}, {"loss": 1.6625, "grad_norm": 0.47366851568222046, "learning_rate": 0.0002, "epoch": 2.7528325639949642, "step": 3280}, {"loss": 1.6437, "grad_norm": 0.5101882815361023, "learning_rate": 0.0002, "epoch": 2.761225346202266, "step": 3290}, {"loss": 1.6488, "grad_norm": 0.4874587059020996, "learning_rate": 0.0002, "epoch": 2.769618128409568, "step": 3300}, {"loss": 1.6151, "grad_norm": 0.4989369213581085, "learning_rate": 0.0002, "epoch": 2.7780109106168696, "step": 3310}, {"loss": 1.6786, "grad_norm": 0.48041442036628723, "learning_rate": 0.0002, "epoch": 2.7864036928241713, "step": 3320}, {"loss": 1.6137, "grad_norm": 0.4845651090145111, "learning_rate": 0.0002, "epoch": 2.7947964750314727, "step": 3330}, {"loss": 1.7154, "grad_norm": 0.48575496673583984, "learning_rate": 0.0002, "epoch": 2.803189257238775, "step": 3340}, {"loss": 1.6771, "grad_norm": 0.509726881980896, "learning_rate": 0.0002, "epoch": 2.811582039446076, "step": 3350}, {"loss": 1.6937, "grad_norm": 0.5026665329933167, "learning_rate": 0.0002, "epoch": 2.819974821653378, "step": 3360}, {"loss": 1.623, "grad_norm": 0.4727601706981659, "learning_rate": 0.0002, "epoch": 2.8283676038606798, "step": 3370}, {"loss": 1.6811, "grad_norm": 0.41952234506607056, "learning_rate": 0.0002, "epoch": 2.8367603860679815, "step": 3380}, {"loss": 1.6639, "grad_norm": 0.49663856625556946, "learning_rate": 0.0002, "epoch": 2.8451531682752833, "step": 3390}, {"loss": 1.6389, "grad_norm": 0.4934511184692383, "learning_rate": 0.0002, "epoch": 2.853545950482585, "step": 3400}, {"loss": 1.6362, "grad_norm": 0.4673226773738861, "learning_rate": 0.0002, "epoch": 2.861938732689887, "step": 3410}, {"loss": 1.641, "grad_norm": 0.48972779512405396, "learning_rate": 0.0002, "epoch": 2.870331514897188, "step": 3420}, {"loss": 1.6047, "grad_norm": 0.5008330345153809, "learning_rate": 0.0002, "epoch": 2.8787242971044904, "step": 3430}, {"loss": 1.6867, "grad_norm": 0.43337664008140564, "learning_rate": 0.0002, "epoch": 2.8871170793117917, "step": 3440}, {"loss": 1.5501, "grad_norm": 0.4430622458457947, "learning_rate": 0.0002, "epoch": 2.8955098615190935, "step": 3450}, {"loss": 1.6415, "grad_norm": 0.45123326778411865, "learning_rate": 0.0002, "epoch": 2.9039026437263953, "step": 3460}, {"loss": 1.5913, "grad_norm": 0.47367340326309204, "learning_rate": 0.0002, "epoch": 2.912295425933697, "step": 3470}, {"loss": 1.5951, "grad_norm": 0.44940701127052307, "learning_rate": 0.0002, "epoch": 2.920688208140999, "step": 3480}, {"loss": 1.6343, "grad_norm": 0.44216281175613403, "learning_rate": 0.0002, "epoch": 2.9290809903483006, "step": 3490}, {"loss": 1.6088, "grad_norm": 0.4824782609939575, "learning_rate": 0.0002, "epoch": 2.9374737725556024, "step": 3500}, {"loss": 1.5949, "grad_norm": 0.43067067861557007, "learning_rate": 0.0002, "epoch": 2.9458665547629037, "step": 3510}, {"loss": 1.547, "grad_norm": 0.46483176946640015, "learning_rate": 0.0002, "epoch": 2.9542593369702055, "step": 3520}, {"loss": 1.5878, "grad_norm": 0.49230799078941345, "learning_rate": 0.0002, "epoch": 2.9626521191775073, "step": 3530}, {"loss": 1.5925, "grad_norm": 0.5081011652946472, "learning_rate": 0.0002, "epoch": 2.971044901384809, "step": 3540}, {"loss": 1.7402, "grad_norm": 0.5326072573661804, "learning_rate": 0.0002, "epoch": 2.979437683592111, "step": 3550}, {"loss": 1.5769, "grad_norm": 0.4981454014778137, "learning_rate": 0.0002, "epoch": 2.9878304657994126, "step": 3560}, {"loss": 1.6073, "grad_norm": 0.4330528676509857, "learning_rate": 0.0002, "epoch": 2.9962232480067144, "step": 3570}, {"eval_loss": 1.824695348739624, "eval_runtime": 37.947, "eval_samples_per_second": 13.572, "eval_steps_per_second": 1.713, "epoch": 2.999580360889635, "step": 3574}, {"loss": 1.5633, "grad_norm": 0.4380604326725006, "learning_rate": 0.0002, "epoch": 3.004616030214016, "step": 3580}, {"loss": 1.4474, "grad_norm": 0.5375564098358154, "learning_rate": 0.0002, "epoch": 3.0130088124213175, "step": 3590}, {"loss": 1.5738, "grad_norm": 0.50722736120224, "learning_rate": 0.0002, "epoch": 3.0214015946286192, "step": 3600}, {"loss": 1.5191, "grad_norm": 0.5398766994476318, "learning_rate": 0.0002, "epoch": 3.029794376835921, "step": 3610}, {"loss": 1.4401, "grad_norm": 0.520709753036499, "learning_rate": 0.0002, "epoch": 3.038187159043223, "step": 3620}, {"loss": 1.5704, "grad_norm": 0.5429664850234985, "learning_rate": 0.0002, "epoch": 3.0465799412505246, "step": 3630}, {"loss": 1.5516, "grad_norm": 0.5634943842887878, "learning_rate": 0.0002, "epoch": 3.0549727234578263, "step": 3640}, {"loss": 1.5349, "grad_norm": 0.5042277574539185, "learning_rate": 0.0002, "epoch": 3.063365505665128, "step": 3650}, {"loss": 1.4708, "grad_norm": 0.5778711438179016, "learning_rate": 0.0002, "epoch": 3.07175828787243, "step": 3660}, {"loss": 1.5196, "grad_norm": 0.5504926443099976, "learning_rate": 0.0002, "epoch": 3.080151070079731, "step": 3670}, {"loss": 1.473, "grad_norm": 0.5199463963508606, "learning_rate": 0.0002, "epoch": 3.088543852287033, "step": 3680}, {"loss": 1.5064, "grad_norm": 0.552334189414978, "learning_rate": 0.0002, "epoch": 3.0969366344943348, "step": 3690}, {"loss": 1.4638, "grad_norm": 0.5650873780250549, "learning_rate": 0.0002, "epoch": 3.1053294167016365, "step": 3700}, {"loss": 1.4945, "grad_norm": 0.6292349696159363, "learning_rate": 0.0002, "epoch": 3.1137221989089383, "step": 3710}, {"loss": 1.4787, "grad_norm": 0.5523604154586792, "learning_rate": 0.0002, "epoch": 3.12211498111624, "step": 3720}, {"loss": 1.4697, "grad_norm": 0.6160100698471069, "learning_rate": 0.0002, "epoch": 3.130507763323542, "step": 3730}, {"loss": 1.5589, "grad_norm": 0.6091629266738892, "learning_rate": 0.0002, "epoch": 3.1389005455308436, "step": 3740}, {"loss": 1.4659, "grad_norm": 0.5695531964302063, "learning_rate": 0.0002, "epoch": 3.1472933277381454, "step": 3750}, {"loss": 1.4605, "grad_norm": 0.569611132144928, "learning_rate": 0.0002, "epoch": 3.1556861099454467, "step": 3760}, {"loss": 1.4592, "grad_norm": 0.5761140584945679, "learning_rate": 0.0002, "epoch": 3.1640788921527485, "step": 3770}, {"loss": 1.4999, "grad_norm": 0.6855548620223999, "learning_rate": 0.0002, "epoch": 3.1724716743600503, "step": 3780}, {"loss": 1.5047, "grad_norm": 0.5815101265907288, "learning_rate": 0.0002, "epoch": 3.180864456567352, "step": 3790}, {"loss": 1.5289, "grad_norm": 0.6179960370063782, "learning_rate": 0.0002, "epoch": 3.189257238774654, "step": 3800}, {"loss": 1.4833, "grad_norm": 0.5418674349784851, "learning_rate": 0.0002, "epoch": 3.1976500209819556, "step": 3810}, {"loss": 1.4994, "grad_norm": 0.5655816197395325, "learning_rate": 0.0002, "epoch": 3.2060428031892574, "step": 3820}, {"loss": 1.5007, "grad_norm": 0.7279291152954102, "learning_rate": 0.0002, "epoch": 3.214435585396559, "step": 3830}, {"loss": 1.5672, "grad_norm": 0.490998238325119, "learning_rate": 0.0002, "epoch": 3.2228283676038605, "step": 3840}, {"loss": 1.4683, "grad_norm": 0.6065797209739685, "learning_rate": 0.0002, "epoch": 3.2312211498111623, "step": 3850}, {"loss": 1.5153, "grad_norm": 0.6024682521820068, "learning_rate": 0.0002, "epoch": 3.239613932018464, "step": 3860}, {"loss": 1.5123, "grad_norm": 0.5571125745773315, "learning_rate": 0.0002, "epoch": 3.248006714225766, "step": 3870}, {"loss": 1.4609, "grad_norm": 0.5662134289741516, "learning_rate": 0.0002, "epoch": 3.2563994964330676, "step": 3880}, {"loss": 1.5452, "grad_norm": 0.5936661958694458, "learning_rate": 0.0002, "epoch": 3.2647922786403694, "step": 3890}, {"loss": 1.5149, "grad_norm": 0.6739671230316162, "learning_rate": 0.0002, "epoch": 3.273185060847671, "step": 3900}, {"loss": 1.5101, "grad_norm": 0.5579532384872437, "learning_rate": 0.0002, "epoch": 3.281577843054973, "step": 3910}, {"loss": 1.4788, "grad_norm": 0.6595954298973083, "learning_rate": 0.0002, "epoch": 3.2899706252622742, "step": 3920}, {"loss": 1.473, "grad_norm": 0.5712262988090515, "learning_rate": 0.0002, "epoch": 3.298363407469576, "step": 3930}, {"loss": 1.5512, "grad_norm": 0.5601761341094971, "learning_rate": 0.0002, "epoch": 3.306756189676878, "step": 3940}, {"loss": 1.4904, "grad_norm": 0.5759967565536499, "learning_rate": 0.0002, "epoch": 3.3151489718841796, "step": 3950}, {"loss": 1.4885, "grad_norm": 0.6543047428131104, "learning_rate": 0.0002, "epoch": 3.3235417540914813, "step": 3960}, {"loss": 1.5063, "grad_norm": 0.6355253458023071, "learning_rate": 0.0002, "epoch": 3.331934536298783, "step": 3970}, {"loss": 1.5025, "grad_norm": 0.5671007633209229, "learning_rate": 0.0002, "epoch": 3.340327318506085, "step": 3980}, {"loss": 1.5049, "grad_norm": 0.6743636727333069, "learning_rate": 0.0002, "epoch": 3.3487201007133867, "step": 3990}, {"loss": 1.5527, "grad_norm": 0.500627338886261, "learning_rate": 0.0002, "epoch": 3.3571128829206884, "step": 4000}, {"loss": 1.4884, "grad_norm": 0.5666340589523315, "learning_rate": 0.0002, "epoch": 3.3655056651279898, "step": 4010}, {"loss": 1.5104, "grad_norm": 0.5651408433914185, "learning_rate": 0.0002, "epoch": 3.3738984473352915, "step": 4020}, {"loss": 1.4907, "grad_norm": 0.6338897943496704, "learning_rate": 0.0002, "epoch": 3.3822912295425933, "step": 4030}, {"loss": 1.553, "grad_norm": 0.5781935453414917, "learning_rate": 0.0002, "epoch": 3.390684011749895, "step": 4040}, {"loss": 1.5535, "grad_norm": 0.55543053150177, "learning_rate": 0.0002, "epoch": 3.399076793957197, "step": 4050}, {"loss": 1.4884, "grad_norm": 0.6602614521980286, "learning_rate": 0.0002, "epoch": 3.4074695761644986, "step": 4060}, {"loss": 1.471, "grad_norm": 0.5514156222343445, "learning_rate": 0.0002, "epoch": 3.4158623583718004, "step": 4070}, {"loss": 1.4634, "grad_norm": 0.5760560035705566, "learning_rate": 0.0002, "epoch": 3.4242551405791017, "step": 4080}, {"loss": 1.4662, "grad_norm": 0.657503604888916, "learning_rate": 0.0002, "epoch": 3.4326479227864035, "step": 4090}, {"loss": 1.5041, "grad_norm": 0.5746736526489258, "learning_rate": 0.0002, "epoch": 3.4410407049937053, "step": 4100}, {"loss": 1.4387, "grad_norm": 0.5988999009132385, "learning_rate": 0.0002, "epoch": 3.449433487201007, "step": 4110}, {"loss": 1.5475, "grad_norm": 0.7294586300849915, "learning_rate": 0.0002, "epoch": 3.457826269408309, "step": 4120}, {"loss": 1.4878, "grad_norm": 0.6391161680221558, "learning_rate": 0.0002, "epoch": 3.4662190516156106, "step": 4130}, {"loss": 1.5366, "grad_norm": 0.6416470408439636, "learning_rate": 0.0002, "epoch": 3.4746118338229124, "step": 4140}, {"loss": 1.5587, "grad_norm": 0.5710626244544983, "learning_rate": 0.0002, "epoch": 3.483004616030214, "step": 4150}, {"loss": 1.4661, "grad_norm": 0.5370054841041565, "learning_rate": 0.0002, "epoch": 3.491397398237516, "step": 4160}, {"loss": 1.5167, "grad_norm": 0.5559558272361755, "learning_rate": 0.0002, "epoch": 3.4997901804448173, "step": 4170}, {"loss": 1.4244, "grad_norm": 0.5426168441772461, "learning_rate": 0.0002, "epoch": 3.508182962652119, "step": 4180}, {"loss": 1.5241, "grad_norm": 0.5997438430786133, "learning_rate": 0.0002, "epoch": 3.516575744859421, "step": 4190}, {"loss": 1.6091, "grad_norm": 0.5399143099784851, "learning_rate": 0.0002, "epoch": 3.5249685270667226, "step": 4200}, {"loss": 1.5066, "grad_norm": 0.6341416239738464, "learning_rate": 0.0002, "epoch": 3.5333613092740244, "step": 4210}, {"loss": 1.5436, "grad_norm": 0.632238507270813, "learning_rate": 0.0002, "epoch": 3.541754091481326, "step": 4220}, {"loss": 1.5423, "grad_norm": 0.6356478333473206, "learning_rate": 0.0002, "epoch": 3.550146873688628, "step": 4230}, {"loss": 1.483, "grad_norm": 0.6379408240318298, "learning_rate": 0.0002, "epoch": 3.5585396558959292, "step": 4240}, {"loss": 1.5184, "grad_norm": 0.6265586018562317, "learning_rate": 0.0002, "epoch": 3.5669324381032315, "step": 4250}, {"loss": 1.5047, "grad_norm": 0.5378820896148682, "learning_rate": 0.0002, "epoch": 3.575325220310533, "step": 4260}, {"loss": 1.5668, "grad_norm": 0.6800801753997803, "learning_rate": 0.0002, "epoch": 3.5837180025178346, "step": 4270}, {"loss": 1.5363, "grad_norm": 0.5653113126754761, "learning_rate": 0.0002, "epoch": 3.5921107847251363, "step": 4280}, {"loss": 1.5007, "grad_norm": 0.548647940158844, "learning_rate": 0.0002, "epoch": 3.600503566932438, "step": 4290}, {"loss": 1.5034, "grad_norm": 0.5729944705963135, "learning_rate": 0.0002, "epoch": 3.60889634913974, "step": 4300}, {"loss": 1.575, "grad_norm": 0.6204999685287476, "learning_rate": 0.0002, "epoch": 3.6172891313470417, "step": 4310}, {"loss": 1.5107, "grad_norm": 0.6275812983512878, "learning_rate": 0.0002, "epoch": 3.6256819135543434, "step": 4320}, {"loss": 1.5013, "grad_norm": 0.7261835336685181, "learning_rate": 0.0002, "epoch": 3.6340746957616448, "step": 4330}, {"loss": 1.5128, "grad_norm": 0.6048004627227783, "learning_rate": 0.0002, "epoch": 3.6424674779689465, "step": 4340}, {"loss": 1.5106, "grad_norm": 0.5879671573638916, "learning_rate": 0.0002, "epoch": 3.6508602601762483, "step": 4350}, {"loss": 1.5477, "grad_norm": 0.6001018285751343, "learning_rate": 0.0002, "epoch": 3.65925304238355, "step": 4360}, {"loss": 1.5247, "grad_norm": 0.6468151211738586, "learning_rate": 0.0002, "epoch": 3.667645824590852, "step": 4370}, {"loss": 1.563, "grad_norm": 0.6342051029205322, "learning_rate": 0.0002, "epoch": 3.6760386067981536, "step": 4380}, {"loss": 1.5444, "grad_norm": 0.6078384518623352, "learning_rate": 0.0002, "epoch": 3.6844313890054554, "step": 4390}, {"loss": 1.5546, "grad_norm": 0.5555588006973267, "learning_rate": 0.0002, "epoch": 3.692824171212757, "step": 4400}, {"loss": 1.5694, "grad_norm": 0.6089665293693542, "learning_rate": 0.0002, "epoch": 3.701216953420059, "step": 4410}, {"loss": 1.5898, "grad_norm": 0.6225191950798035, "learning_rate": 0.0002, "epoch": 3.7096097356273603, "step": 4420}, {"loss": 1.5153, "grad_norm": 0.5642715692520142, "learning_rate": 0.0002, "epoch": 3.718002517834662, "step": 4430}, {"loss": 1.5057, "grad_norm": 0.5703449845314026, "learning_rate": 0.0002, "epoch": 3.726395300041964, "step": 4440}, {"loss": 1.5451, "grad_norm": 0.6029745936393738, "learning_rate": 0.0002, "epoch": 3.7347880822492656, "step": 4450}, {"loss": 1.5044, "grad_norm": 0.7089189887046814, "learning_rate": 0.0002, "epoch": 3.7431808644565674, "step": 4460}, {"loss": 1.4804, "grad_norm": 0.6230936050415039, "learning_rate": 0.0002, "epoch": 3.751573646663869, "step": 4470}, {"loss": 1.567, "grad_norm": 0.5718494653701782, "learning_rate": 0.0002, "epoch": 3.759966428871171, "step": 4480}, {"loss": 1.5612, "grad_norm": 0.5404117703437805, "learning_rate": 0.0002, "epoch": 3.7683592110784723, "step": 4490}, {"loss": 1.4707, "grad_norm": 0.5816529393196106, "learning_rate": 0.0002, "epoch": 3.7767519932857745, "step": 4500}, {"loss": 1.5802, "grad_norm": 0.6314901113510132, "learning_rate": 0.0002, "epoch": 3.785144775493076, "step": 4510}, {"loss": 1.5445, "grad_norm": 0.7639698386192322, "learning_rate": 0.0002, "epoch": 3.7935375577003776, "step": 4520}, {"loss": 1.5718, "grad_norm": 0.5727366209030151, "learning_rate": 0.0002, "epoch": 3.8019303399076794, "step": 4530}, {"loss": 1.5409, "grad_norm": 0.6467128396034241, "learning_rate": 0.0002, "epoch": 3.810323122114981, "step": 4540}, {"loss": 1.5266, "grad_norm": 0.6572837233543396, "learning_rate": 0.0002, "epoch": 3.818715904322283, "step": 4550}, {"loss": 1.5718, "grad_norm": 0.5847418904304504, "learning_rate": 0.0002, "epoch": 3.8271086865295847, "step": 4560}, {"loss": 1.5303, "grad_norm": 0.48820871114730835, "learning_rate": 0.0002, "epoch": 3.8355014687368865, "step": 4570}, {"loss": 1.4911, "grad_norm": 1.2537429332733154, "learning_rate": 0.0002, "epoch": 3.843894250944188, "step": 4580}, {"loss": 1.5522, "grad_norm": 0.6026989221572876, "learning_rate": 0.0002, "epoch": 3.8522870331514896, "step": 4590}, {"loss": 1.5035, "grad_norm": 0.5541417598724365, "learning_rate": 0.0002, "epoch": 3.8606798153587913, "step": 4600}, {"loss": 1.5238, "grad_norm": 0.7668771147727966, "learning_rate": 0.0002, "epoch": 3.869072597566093, "step": 4610}, {"loss": 1.5428, "grad_norm": 0.6181227564811707, "learning_rate": 0.0002, "epoch": 3.877465379773395, "step": 4620}, {"loss": 1.5242, "grad_norm": 0.5842700004577637, "learning_rate": 0.0002, "epoch": 3.8858581619806967, "step": 4630}, {"loss": 1.5501, "grad_norm": 0.5824751257896423, "learning_rate": 0.0002, "epoch": 3.8942509441879984, "step": 4640}, {"loss": 1.4443, "grad_norm": 0.6212735772132874, "learning_rate": 0.0002, "epoch": 3.9026437263952998, "step": 4650}, {"loss": 1.4972, "grad_norm": 0.6123346090316772, "learning_rate": 0.0002, "epoch": 3.911036508602602, "step": 4660}, {"loss": 1.5531, "grad_norm": 0.518662691116333, "learning_rate": 0.0002, "epoch": 3.9194292908099033, "step": 4670}, {"loss": 1.5151, "grad_norm": 0.6963476538658142, "learning_rate": 0.0002, "epoch": 3.927822073017205, "step": 4680}, {"loss": 1.5826, "grad_norm": 0.5192152261734009, "learning_rate": 0.0002, "epoch": 3.936214855224507, "step": 4690}, {"loss": 1.5312, "grad_norm": 0.5820888876914978, "learning_rate": 0.0002, "epoch": 3.9446076374318086, "step": 4700}, {"loss": 1.527, "grad_norm": 0.6320387721061707, "learning_rate": 0.0002, "epoch": 3.9530004196391104, "step": 4710}, {"loss": 1.6006, "grad_norm": 0.6174548268318176, "learning_rate": 0.0002, "epoch": 3.961393201846412, "step": 4720}, {"loss": 1.5581, "grad_norm": 0.6691966652870178, "learning_rate": 0.0002, "epoch": 3.969785984053714, "step": 4730}, {"loss": 1.4762, "grad_norm": 0.5972068309783936, "learning_rate": 0.0002, "epoch": 3.9781787662610153, "step": 4740}, {"loss": 1.4947, "grad_norm": 0.5759536027908325, "learning_rate": 0.0002, "epoch": 3.9865715484683175, "step": 4750}, {"loss": 1.4836, "grad_norm": 0.5886756777763367, "learning_rate": 0.0002, "epoch": 3.994964330675619, "step": 4760}, {"eval_loss": 1.8749940395355225, "eval_runtime": 38.037, "eval_samples_per_second": 13.539, "eval_steps_per_second": 1.709, "epoch": 4.0, "step": 4766}, {"loss": 1.5259, "grad_norm": 0.5915011167526245, "learning_rate": 0.0002, "epoch": 4.003357112882921, "step": 4770}, {"loss": 1.4071, "grad_norm": 0.8565000891685486, "learning_rate": 0.0002, "epoch": 4.011749895090222, "step": 4780}, {"loss": 1.3211, "grad_norm": 0.7753950953483582, "learning_rate": 0.0002, "epoch": 4.020142677297524, "step": 4790}, {"loss": 1.3607, "grad_norm": 0.6837254166603088, "learning_rate": 0.0002, "epoch": 4.028535459504826, "step": 4800}, {"loss": 1.3275, "grad_norm": 0.8374526500701904, "learning_rate": 0.0002, "epoch": 4.036928241712127, "step": 4810}, {"loss": 1.3579, "grad_norm": 0.8717963099479675, "learning_rate": 0.0002, "epoch": 4.0453210239194295, "step": 4820}, {"loss": 1.3374, "grad_norm": 0.7002043724060059, "learning_rate": 0.0002, "epoch": 4.053713806126731, "step": 4830}, {"loss": 1.3882, "grad_norm": 1.0319572687149048, "learning_rate": 0.0002, "epoch": 4.062106588334033, "step": 4840}, {"loss": 1.3291, "grad_norm": 0.6746882200241089, "learning_rate": 0.0002, "epoch": 4.070499370541334, "step": 4850}, {"loss": 1.339, "grad_norm": 0.8187578320503235, "learning_rate": 0.0002, "epoch": 4.078892152748637, "step": 4860}, {"loss": 1.368, "grad_norm": 0.7888399362564087, "learning_rate": 0.0002, "epoch": 4.087284934955938, "step": 4870}, {"loss": 1.4115, "grad_norm": 0.7149351239204407, "learning_rate": 0.0002, "epoch": 4.095677717163239, "step": 4880}, {"loss": 1.341, "grad_norm": 0.9067983031272888, "learning_rate": 0.0002, "epoch": 4.1040704993705415, "step": 4890}, {"loss": 1.4084, "grad_norm": 0.771186351776123, "learning_rate": 0.0002, "epoch": 4.112463281577843, "step": 4900}, {"loss": 1.2722, "grad_norm": 0.7756485342979431, "learning_rate": 0.0002, "epoch": 4.120856063785145, "step": 4910}, {"loss": 1.4138, "grad_norm": 0.7149116396903992, "learning_rate": 0.0002, "epoch": 4.129248845992446, "step": 4920}, {"loss": 1.3102, "grad_norm": 0.700442910194397, "learning_rate": 0.0002, "epoch": 4.137641628199749, "step": 4930}, {"loss": 1.3628, "grad_norm": 0.8439189195632935, "learning_rate": 0.0002, "epoch": 4.14603441040705, "step": 4940}, {"loss": 1.3511, "grad_norm": 0.6570779085159302, "learning_rate": 0.0002, "epoch": 4.154427192614351, "step": 4950}, {"loss": 1.3955, "grad_norm": 0.886482298374176, "learning_rate": 0.0002, "epoch": 4.1628199748216534, "step": 4960}, {"loss": 1.4083, "grad_norm": 0.7220938801765442, "learning_rate": 0.0002, "epoch": 4.171212757028955, "step": 4970}, {"loss": 1.3611, "grad_norm": 0.7185905575752258, "learning_rate": 0.0002, "epoch": 4.179605539236257, "step": 4980}, {"loss": 1.3623, "grad_norm": 0.7566333413124084, "learning_rate": 0.0002, "epoch": 4.187998321443558, "step": 4990}, {"loss": 1.2771, "grad_norm": 0.6960445642471313, "learning_rate": 0.0002, "epoch": 4.1963911036508605, "step": 5000}, {"loss": 1.3565, "grad_norm": 0.7727336883544922, "learning_rate": 0.0002, "epoch": 4.204783885858162, "step": 5010}, {"loss": 1.4156, "grad_norm": 0.8038365244865417, "learning_rate": 0.0002, "epoch": 4.213176668065464, "step": 5020}, {"loss": 1.3849, "grad_norm": 0.7587628364562988, "learning_rate": 0.0002, "epoch": 4.221569450272765, "step": 5030}, {"loss": 1.4047, "grad_norm": 0.928032398223877, "learning_rate": 0.0002, "epoch": 4.229962232480067, "step": 5040}, {"loss": 1.3768, "grad_norm": 0.7168642282485962, "learning_rate": 0.0002, "epoch": 4.238355014687369, "step": 5050}, {"loss": 1.3767, "grad_norm": 0.7981422543525696, "learning_rate": 0.0002, "epoch": 4.24674779689467, "step": 5060}, {"loss": 1.406, "grad_norm": 0.6951150894165039, "learning_rate": 0.0002, "epoch": 4.2551405791019725, "step": 5070}, {"loss": 1.3776, "grad_norm": 0.7337371706962585, "learning_rate": 0.0002, "epoch": 4.263533361309274, "step": 5080}, {"loss": 1.3425, "grad_norm": 0.8367464542388916, "learning_rate": 0.0002, "epoch": 4.271926143516576, "step": 5090}, {"loss": 1.3823, "grad_norm": 0.6744083166122437, "learning_rate": 0.0002, "epoch": 4.280318925723877, "step": 5100}, {"loss": 1.4183, "grad_norm": 0.9072301387786865, "learning_rate": 0.0002, "epoch": 4.28871170793118, "step": 5110}, {"loss": 1.4219, "grad_norm": 0.7703930735588074, "learning_rate": 0.0002, "epoch": 4.297104490138481, "step": 5120}, {"loss": 1.3658, "grad_norm": 0.6734083294868469, "learning_rate": 0.0002, "epoch": 4.305497272345782, "step": 5130}, {"loss": 1.441, "grad_norm": 0.7835540175437927, "learning_rate": 0.0002, "epoch": 4.3138900545530845, "step": 5140}, {"loss": 1.384, "grad_norm": 1.0822200775146484, "learning_rate": 0.0002, "epoch": 4.322282836760386, "step": 5150}, {"loss": 1.4167, "grad_norm": 0.8432536721229553, "learning_rate": 0.0002, "epoch": 4.330675618967688, "step": 5160}, {"loss": 1.3796, "grad_norm": 0.6739283800125122, "learning_rate": 0.0002, "epoch": 4.339068401174989, "step": 5170}, {"loss": 1.3651, "grad_norm": 0.7395278811454773, "learning_rate": 0.0002, "epoch": 4.347461183382292, "step": 5180}, {"loss": 1.3258, "grad_norm": 0.7638891339302063, "learning_rate": 0.0002, "epoch": 4.355853965589593, "step": 5190}, {"loss": 1.34, "grad_norm": 1.1222662925720215, "learning_rate": 0.0002, "epoch": 4.364246747796894, "step": 5200}, {"loss": 1.3757, "grad_norm": 0.9102525115013123, "learning_rate": 0.0002, "epoch": 4.3726395300041965, "step": 5210}, {"loss": 1.413, "grad_norm": 0.7181593775749207, "learning_rate": 0.0002, "epoch": 4.381032312211498, "step": 5220}, {"loss": 1.3808, "grad_norm": 0.7813979387283325, "learning_rate": 0.0002, "epoch": 4.3894250944188, "step": 5230}, {"loss": 1.423, "grad_norm": 0.8906185626983643, "learning_rate": 0.0002, "epoch": 4.397817876626101, "step": 5240}, {"loss": 1.3901, "grad_norm": 0.7456443309783936, "learning_rate": 0.0002, "epoch": 4.406210658833404, "step": 5250}, {"loss": 1.3292, "grad_norm": 0.8752070069313049, "learning_rate": 0.0002, "epoch": 4.414603441040705, "step": 5260}, {"loss": 1.3351, "grad_norm": 0.9560954570770264, "learning_rate": 0.0002, "epoch": 4.422996223248007, "step": 5270}, {"loss": 1.3708, "grad_norm": 0.7227762341499329, "learning_rate": 0.0002, "epoch": 4.4313890054553084, "step": 5280}, {"loss": 1.4281, "grad_norm": 0.8141599893569946, "learning_rate": 0.0002, "epoch": 4.43978178766261, "step": 5290}, {"loss": 1.381, "grad_norm": 0.928382158279419, "learning_rate": 0.0002, "epoch": 4.448174569869912, "step": 5300}, {"loss": 1.3586, "grad_norm": 0.7719997763633728, "learning_rate": 0.0002, "epoch": 4.456567352077213, "step": 5310}, {"loss": 1.3652, "grad_norm": 0.8081879615783691, "learning_rate": 0.0002, "epoch": 4.4649601342845155, "step": 5320}, {"loss": 1.4121, "grad_norm": 0.7903412580490112, "learning_rate": 0.0002, "epoch": 4.473352916491817, "step": 5330}, {"loss": 1.4453, "grad_norm": 0.7751287221908569, "learning_rate": 0.0002, "epoch": 4.481745698699119, "step": 5340}, {"loss": 1.392, "grad_norm": 0.8287544250488281, "learning_rate": 0.0002, "epoch": 4.49013848090642, "step": 5350}, {"loss": 1.3841, "grad_norm": 0.7431012392044067, "learning_rate": 0.0002, "epoch": 4.498531263113723, "step": 5360}, {"loss": 1.3843, "grad_norm": 0.8648661971092224, "learning_rate": 0.0002, "epoch": 4.506924045321024, "step": 5370}, {"loss": 1.3742, "grad_norm": 0.9314997792243958, "learning_rate": 0.0002, "epoch": 4.515316827528325, "step": 5380}, {"loss": 1.354, "grad_norm": 0.7530864477157593, "learning_rate": 0.0002, "epoch": 4.5237096097356275, "step": 5390}, {"loss": 1.4159, "grad_norm": 0.8739821910858154, "learning_rate": 0.0002, "epoch": 4.532102391942929, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.8090344667434692, "learning_rate": 0.0002, "epoch": 4.540495174150231, "step": 5410}, {"loss": 1.4187, "grad_norm": 0.7530879974365234, "learning_rate": 0.0002, "epoch": 4.548887956357532, "step": 5420}, {"loss": 1.47, "grad_norm": 0.8787251114845276, "learning_rate": 0.0002, "epoch": 4.557280738564835, "step": 5430}, {"loss": 1.375, "grad_norm": 0.813961923122406, "learning_rate": 0.0002, "epoch": 4.565673520772136, "step": 5440}, {"loss": 1.4475, "grad_norm": 0.7778232097625732, "learning_rate": 0.0002, "epoch": 4.574066302979437, "step": 5450}, {"loss": 1.4421, "grad_norm": 0.7323020696640015, "learning_rate": 0.0002, "epoch": 4.5824590851867395, "step": 5460}, {"loss": 1.396, "grad_norm": 0.7826765179634094, "learning_rate": 0.0002, "epoch": 4.590851867394041, "step": 5470}, {"loss": 1.4068, "grad_norm": 0.7245969772338867, "learning_rate": 0.0002, "epoch": 4.599244649601343, "step": 5480}, {"loss": 1.4276, "grad_norm": 0.7697308659553528, "learning_rate": 0.0002, "epoch": 4.607637431808644, "step": 5490}, {"loss": 1.3849, "grad_norm": 0.8053571581840515, "learning_rate": 0.0002, "epoch": 4.616030214015947, "step": 5500}, {"loss": 1.4225, "grad_norm": 0.6728386282920837, "learning_rate": 0.0002, "epoch": 4.624422996223248, "step": 5510}, {"loss": 1.3771, "grad_norm": 0.7398585677146912, "learning_rate": 0.0002, "epoch": 4.632815778430549, "step": 5520}, {"loss": 1.4216, "grad_norm": 0.7896319031715393, "learning_rate": 0.0002, "epoch": 4.6412085606378515, "step": 5530}, {"loss": 1.4199, "grad_norm": 0.8290980458259583, "learning_rate": 0.0002, "epoch": 4.649601342845153, "step": 5540}, {"loss": 1.463, "grad_norm": 0.8232647776603699, "learning_rate": 0.0002, "epoch": 4.657994125052455, "step": 5550}, {"loss": 1.3925, "grad_norm": 0.9154987335205078, "learning_rate": 0.0002, "epoch": 4.666386907259756, "step": 5560}, {"loss": 1.3674, "grad_norm": 0.8400886654853821, "learning_rate": 0.0002, "epoch": 4.674779689467059, "step": 5570}, {"loss": 1.379, "grad_norm": 0.7312718629837036, "learning_rate": 0.0002, "epoch": 4.68317247167436, "step": 5580}, {"loss": 1.3925, "grad_norm": 0.8043803572654724, "learning_rate": 0.0002, "epoch": 4.691565253881662, "step": 5590}, {"loss": 1.3952, "grad_norm": 0.7966225147247314, "learning_rate": 0.0002, "epoch": 4.6999580360889635, "step": 5600}, {"loss": 1.3429, "grad_norm": 0.881574809551239, "learning_rate": 0.0002, "epoch": 4.708350818296266, "step": 5610}, {"loss": 1.4444, "grad_norm": 0.7252084016799927, "learning_rate": 0.0002, "epoch": 4.716743600503567, "step": 5620}, {"loss": 1.3566, "grad_norm": 0.7726518511772156, "learning_rate": 0.0002, "epoch": 4.725136382710868, "step": 5630}, {"loss": 1.3954, "grad_norm": 0.7306379079818726, "learning_rate": 0.0002, "epoch": 4.7335291649181706, "step": 5640}, {"loss": 1.4385, "grad_norm": 0.8029969334602356, "learning_rate": 0.0002, "epoch": 4.741921947125472, "step": 5650}, {"loss": 1.3966, "grad_norm": 0.9103893637657166, "learning_rate": 0.0002, "epoch": 4.750314729332774, "step": 5660}, {"loss": 1.4026, "grad_norm": 0.8783416748046875, "learning_rate": 0.0002, "epoch": 4.758707511540075, "step": 5670}, {"loss": 1.3427, "grad_norm": 0.6807119846343994, "learning_rate": 0.0002, "epoch": 4.767100293747378, "step": 5680}, {"loss": 1.4148, "grad_norm": 0.7103772759437561, "learning_rate": 0.0002, "epoch": 4.775493075954679, "step": 5690}, {"loss": 1.4079, "grad_norm": 0.8472093343734741, "learning_rate": 0.0002, "epoch": 4.78388585816198, "step": 5700}, {"loss": 1.3937, "grad_norm": 0.851847231388092, "learning_rate": 0.0002, "epoch": 4.7922786403692825, "step": 5710}, {"loss": 1.3965, "grad_norm": 0.9084636569023132, "learning_rate": 0.0002, "epoch": 4.800671422576584, "step": 5720}, {"loss": 1.4358, "grad_norm": 0.7628585696220398, "learning_rate": 0.0002, "epoch": 4.809064204783886, "step": 5730}, {"loss": 1.3746, "grad_norm": 0.775580883026123, "learning_rate": 0.0002, "epoch": 4.817456986991187, "step": 5740}, {"loss": 1.4573, "grad_norm": 0.7855771780014038, "learning_rate": 0.0002, "epoch": 4.82584976919849, "step": 5750}, {"loss": 1.3991, "grad_norm": 0.7021728754043579, "learning_rate": 0.0002, "epoch": 4.834242551405791, "step": 5760}, {"loss": 1.4012, "grad_norm": 0.7810541391372681, "learning_rate": 0.0002, "epoch": 4.842635333613092, "step": 5770}, {"loss": 1.396, "grad_norm": 0.7290041446685791, "learning_rate": 0.0002, "epoch": 4.8510281158203945, "step": 5780}, {"loss": 1.4769, "grad_norm": 0.9059709906578064, "learning_rate": 0.0002, "epoch": 4.859420898027696, "step": 5790}, {"loss": 1.4091, "grad_norm": 0.8338062167167664, "learning_rate": 0.0002, "epoch": 4.867813680234998, "step": 5800}, {"loss": 1.395, "grad_norm": 0.830926775932312, "learning_rate": 0.0002, "epoch": 4.876206462442299, "step": 5810}, {"loss": 1.4261, "grad_norm": 0.7818633317947388, "learning_rate": 0.0002, "epoch": 4.884599244649602, "step": 5820}, {"loss": 1.4252, "grad_norm": 0.8143376708030701, "learning_rate": 0.0002, "epoch": 4.892992026856903, "step": 5830}, {"loss": 1.3583, "grad_norm": 0.7754496335983276, "learning_rate": 0.0002, "epoch": 4.901384809064205, "step": 5840}, {"loss": 1.4036, "grad_norm": 0.7154468297958374, "learning_rate": 0.0002, "epoch": 4.9097775912715065, "step": 5850}, {"loss": 1.3909, "grad_norm": 0.6829783916473389, "learning_rate": 0.0002, "epoch": 4.918170373478809, "step": 5860}, {"loss": 1.3854, "grad_norm": 0.784919261932373, "learning_rate": 0.0002, "epoch": 4.92656315568611, "step": 5870}, {"loss": 1.4277, "grad_norm": 0.8168354034423828, "learning_rate": 0.0002, "epoch": 4.934955937893411, "step": 5880}, {"loss": 1.3694, "grad_norm": 0.7356618642807007, "learning_rate": 0.0002, "epoch": 4.943348720100714, "step": 5890}, {"loss": 1.4827, "grad_norm": 0.7399224042892456, "learning_rate": 0.0002, "epoch": 4.951741502308015, "step": 5900}, {"loss": 1.3643, "grad_norm": 0.7430436015129089, "learning_rate": 0.0002, "epoch": 4.960134284515317, "step": 5910}, {"loss": 1.3836, "grad_norm": 0.7587705850601196, "learning_rate": 0.0002, "epoch": 4.9685270667226185, "step": 5920}, {"loss": 1.4162, "grad_norm": 0.9103638529777527, "learning_rate": 0.0002, "epoch": 4.976919848929921, "step": 5930}, {"loss": 1.4688, "grad_norm": 0.7357394695281982, "learning_rate": 0.0002, "epoch": 4.985312631137222, "step": 5940}, {"loss": 1.3988, "grad_norm": 0.7371547222137451, "learning_rate": 0.0002, "epoch": 4.993705413344523, "step": 5950}, {"eval_loss": 1.9367210865020752, "eval_runtime": 37.9833, "eval_samples_per_second": 13.559, "eval_steps_per_second": 1.711, "epoch": 4.9995803608896345, "step": 5957}, {"loss": 1.3876, "grad_norm": 0.7783351540565491, "learning_rate": 0.0002, "epoch": 5.0020981955518256, "step": 5960}, {"loss": 1.2387, "grad_norm": 0.9268898367881775, "learning_rate": 0.0002, "epoch": 5.010490977759127, "step": 5970}, {"loss": 1.2621, "grad_norm": 0.9562761783599854, "learning_rate": 0.0002, "epoch": 5.018883759966429, "step": 5980}, {"loss": 1.205, "grad_norm": 0.9391738176345825, "learning_rate": 0.0002, "epoch": 5.02727654217373, "step": 5990}, {"loss": 1.2112, "grad_norm": 0.850326418876648, "learning_rate": 0.0002, "epoch": 5.035669324381033, "step": 6000}, {"loss": 1.2285, "grad_norm": 0.8442679643630981, "learning_rate": 0.0002, "epoch": 5.044062106588334, "step": 6010}, {"loss": 1.1677, "grad_norm": 1.2147290706634521, "learning_rate": 0.0002, "epoch": 5.052454888795635, "step": 6020}, {"loss": 1.1836, "grad_norm": 0.9732922315597534, "learning_rate": 0.0002, "epoch": 5.0608476710029375, "step": 6030}, {"loss": 1.215, "grad_norm": 0.9354516267776489, "learning_rate": 0.0002, "epoch": 5.069240453210239, "step": 6040}, {"loss": 1.1918, "grad_norm": 0.9681560397148132, "learning_rate": 0.0002, "epoch": 5.077633235417541, "step": 6050}, {"loss": 1.2146, "grad_norm": 0.9500439763069153, "learning_rate": 0.0002, "epoch": 5.086026017624842, "step": 6060}, {"loss": 1.1475, "grad_norm": 0.8693879246711731, "learning_rate": 0.0002, "epoch": 5.094418799832145, "step": 6070}, {"loss": 1.2181, "grad_norm": 1.1066458225250244, "learning_rate": 0.0002, "epoch": 5.102811582039446, "step": 6080}, {"loss": 1.2135, "grad_norm": 0.9530285000801086, "learning_rate": 0.0002, "epoch": 5.111204364246748, "step": 6090}, {"loss": 1.2388, "grad_norm": 0.9323630928993225, "learning_rate": 0.0002, "epoch": 5.1195971464540495, "step": 6100}, {"loss": 1.2434, "grad_norm": 0.9040294885635376, "learning_rate": 0.0002, "epoch": 5.127989928661351, "step": 6110}, {"loss": 1.2502, "grad_norm": 0.9981122612953186, "learning_rate": 0.0002, "epoch": 5.136382710868653, "step": 6120}, {"loss": 1.2648, "grad_norm": 0.9070921540260315, "learning_rate": 0.0002, "epoch": 5.144775493075954, "step": 6130}, {"loss": 1.2802, "grad_norm": 1.043802261352539, "learning_rate": 0.0002, "epoch": 5.153168275283257, "step": 6140}, {"loss": 1.1865, "grad_norm": 1.0889761447906494, "learning_rate": 0.0002, "epoch": 5.161561057490558, "step": 6150}, {"loss": 1.2498, "grad_norm": 0.9908999800682068, "learning_rate": 0.0002, "epoch": 5.16995383969786, "step": 6160}, {"loss": 1.2981, "grad_norm": 1.099233865737915, "learning_rate": 0.0002, "epoch": 5.1783466219051615, "step": 6170}, {"loss": 1.2236, "grad_norm": 0.9536478519439697, "learning_rate": 0.0002, "epoch": 5.186739404112464, "step": 6180}, {"loss": 1.1889, "grad_norm": 0.8672952055931091, "learning_rate": 0.0002, "epoch": 5.195132186319765, "step": 6190}, {"loss": 1.2142, "grad_norm": 1.0116329193115234, "learning_rate": 0.0002, "epoch": 5.203524968527066, "step": 6200}, {"loss": 1.1813, "grad_norm": 0.9327153563499451, "learning_rate": 0.0002, "epoch": 5.211917750734369, "step": 6210}, {"loss": 1.2372, "grad_norm": 0.85637366771698, "learning_rate": 0.0002, "epoch": 5.22031053294167, "step": 6220}, {"loss": 1.2949, "grad_norm": 1.0490736961364746, "learning_rate": 0.0002, "epoch": 5.228703315148972, "step": 6230}, {"loss": 1.1604, "grad_norm": 0.8849565982818604, "learning_rate": 0.0002, "epoch": 5.2370960973562735, "step": 6240}, {"loss": 1.2257, "grad_norm": 0.8852671980857849, "learning_rate": 0.0002, "epoch": 5.245488879563576, "step": 6250}, {"loss": 1.275, "grad_norm": 0.9146860241889954, "learning_rate": 0.0002, "epoch": 5.253881661770877, "step": 6260}, {"loss": 1.2543, "grad_norm": 1.0188325643539429, "learning_rate": 0.0002, "epoch": 5.262274443978178, "step": 6270}, {"loss": 1.1703, "grad_norm": 1.0053156614303589, "learning_rate": 0.0002, "epoch": 5.270667226185481, "step": 6280}, {"loss": 1.2594, "grad_norm": 0.9962273836135864, "learning_rate": 0.0002, "epoch": 5.279060008392782, "step": 6290}, {"loss": 1.2487, "grad_norm": 1.000300645828247, "learning_rate": 0.0002, "epoch": 5.287452790600084, "step": 6300}, {"loss": 1.3214, "grad_norm": 0.9821932911872864, "learning_rate": 0.0002, "epoch": 5.295845572807385, "step": 6310}, {"loss": 1.2964, "grad_norm": 1.0103896856307983, "learning_rate": 0.0002, "epoch": 5.304238355014688, "step": 6320}, {"loss": 1.2497, "grad_norm": 0.9323601722717285, "learning_rate": 0.0002, "epoch": 5.312631137221989, "step": 6330}, {"loss": 1.3165, "grad_norm": 1.0668879747390747, "learning_rate": 0.0002, "epoch": 5.321023919429291, "step": 6340}, {"loss": 1.2411, "grad_norm": 0.9666323065757751, "learning_rate": 0.0002, "epoch": 5.3294167016365925, "step": 6350}, {"loss": 1.2129, "grad_norm": 0.9439574480056763, "learning_rate": 0.0002, "epoch": 5.337809483843894, "step": 6360}, {"loss": 1.2355, "grad_norm": 1.0229361057281494, "learning_rate": 0.0002, "epoch": 5.346202266051196, "step": 6370}, {"loss": 1.2021, "grad_norm": 0.8522404432296753, "learning_rate": 0.0002, "epoch": 5.354595048258497, "step": 6380}, {"loss": 1.32, "grad_norm": 1.3732287883758545, "learning_rate": 0.0002, "epoch": 5.3629878304658, "step": 6390}, {"loss": 1.1987, "grad_norm": 0.8201091885566711, "learning_rate": 0.0002, "epoch": 5.371380612673101, "step": 6400}, {"loss": 1.2867, "grad_norm": 0.8874436616897583, "learning_rate": 0.0002, "epoch": 5.379773394880403, "step": 6410}, {"loss": 1.2686, "grad_norm": 1.0118640661239624, "learning_rate": 0.0002, "epoch": 5.3881661770877045, "step": 6420}, {"loss": 1.2952, "grad_norm": 1.0468370914459229, "learning_rate": 0.0002, "epoch": 5.396558959295007, "step": 6430}, {"loss": 1.2057, "grad_norm": 0.941806972026825, "learning_rate": 0.0002, "epoch": 5.404951741502308, "step": 6440}, {"loss": 1.3289, "grad_norm": 0.9860424399375916, "learning_rate": 0.0002, "epoch": 5.413344523709609, "step": 6450}, {"loss": 1.2887, "grad_norm": 1.009628176689148, "learning_rate": 0.0002, "epoch": 5.421737305916912, "step": 6460}, {"loss": 1.2544, "grad_norm": 0.9842159748077393, "learning_rate": 0.0002, "epoch": 5.430130088124213, "step": 6470}, {"loss": 1.2277, "grad_norm": 0.9935571551322937, "learning_rate": 0.0002, "epoch": 5.438522870331515, "step": 6480}, {"loss": 1.2392, "grad_norm": 0.8872362971305847, "learning_rate": 0.0002, "epoch": 5.4469156525388165, "step": 6490}, {"loss": 1.2166, "grad_norm": 0.9530836939811707, "learning_rate": 0.0002, "epoch": 5.455308434746119, "step": 6500}, {"loss": 1.2138, "grad_norm": 0.8111279010772705, "learning_rate": 0.0002, "epoch": 5.46370121695342, "step": 6510}, {"loss": 1.2375, "grad_norm": 1.0474516153335571, "learning_rate": 0.0002, "epoch": 5.472093999160721, "step": 6520}, {"loss": 1.2752, "grad_norm": 1.0228482484817505, "learning_rate": 0.0002, "epoch": 5.480486781368024, "step": 6530}, {"loss": 1.2739, "grad_norm": 1.0299347639083862, "learning_rate": 0.0002, "epoch": 5.488879563575325, "step": 6540}, {"loss": 1.3163, "grad_norm": 0.9105098247528076, "learning_rate": 0.0002, "epoch": 5.497272345782627, "step": 6550}, {"loss": 1.2718, "grad_norm": 1.2459523677825928, "learning_rate": 0.0002, "epoch": 5.5056651279899285, "step": 6560}, {"loss": 1.2697, "grad_norm": 1.0630481243133545, "learning_rate": 0.0002, "epoch": 5.514057910197231, "step": 6570}, {"loss": 1.3003, "grad_norm": 0.8310980796813965, "learning_rate": 0.0002, "epoch": 5.522450692404532, "step": 6580}, {"loss": 1.1855, "grad_norm": 1.102723479270935, "learning_rate": 0.0002, "epoch": 5.530843474611833, "step": 6590}, {"loss": 1.2889, "grad_norm": 0.9586807489395142, "learning_rate": 0.0002, "epoch": 5.539236256819136, "step": 6600}, {"loss": 1.2899, "grad_norm": 0.976191520690918, "learning_rate": 0.0002, "epoch": 5.547629039026437, "step": 6610}, {"loss": 1.2319, "grad_norm": 0.9943762421607971, "learning_rate": 0.0002, "epoch": 5.556021821233739, "step": 6620}, {"loss": 1.3103, "grad_norm": 0.8788089156150818, "learning_rate": 0.0002, "epoch": 5.56441460344104, "step": 6630}, {"loss": 1.1982, "grad_norm": 0.9866173267364502, "learning_rate": 0.0002, "epoch": 5.572807385648343, "step": 6640}, {"loss": 1.2686, "grad_norm": 1.0791642665863037, "learning_rate": 0.0002, "epoch": 5.581200167855644, "step": 6650}, {"loss": 1.2806, "grad_norm": 0.836482584476471, "learning_rate": 0.0002, "epoch": 5.589592950062946, "step": 6660}, {"loss": 1.3114, "grad_norm": 0.9841130971908569, "learning_rate": 0.0002, "epoch": 5.5979857322702475, "step": 6670}, {"loss": 1.2323, "grad_norm": 0.9678813815116882, "learning_rate": 0.0002, "epoch": 5.60637851447755, "step": 6680}, {"loss": 1.1969, "grad_norm": 0.9033233523368835, "learning_rate": 0.0002, "epoch": 5.614771296684851, "step": 6690}, {"loss": 1.2565, "grad_norm": 0.8691515922546387, "learning_rate": 0.0002, "epoch": 5.623164078892152, "step": 6700}, {"loss": 1.2678, "grad_norm": 0.8971360921859741, "learning_rate": 0.0002, "epoch": 5.631556861099455, "step": 6710}, {"loss": 1.2266, "grad_norm": 0.9377756118774414, "learning_rate": 0.0002, "epoch": 5.639949643306756, "step": 6720}, {"loss": 1.28, "grad_norm": 0.908762514591217, "learning_rate": 0.0002, "epoch": 5.648342425514058, "step": 6730}, {"loss": 1.2499, "grad_norm": 1.0503337383270264, "learning_rate": 0.0002, "epoch": 5.6567352077213595, "step": 6740}, {"loss": 1.3604, "grad_norm": 1.030267357826233, "learning_rate": 0.0002, "epoch": 5.665127989928662, "step": 6750}, {"loss": 1.2223, "grad_norm": 0.9150485992431641, "learning_rate": 0.0002, "epoch": 5.673520772135963, "step": 6760}, {"loss": 1.2651, "grad_norm": 1.0300343036651611, "learning_rate": 0.0002, "epoch": 5.681913554343264, "step": 6770}, {"loss": 1.2506, "grad_norm": 1.1242924928665161, "learning_rate": 0.0002, "epoch": 5.690306336550567, "step": 6780}, {"loss": 1.3318, "grad_norm": 0.9489498138427734, "learning_rate": 0.0002, "epoch": 5.698699118757868, "step": 6790}, {"loss": 1.2578, "grad_norm": 0.8829707503318787, "learning_rate": 0.0002, "epoch": 5.70709190096517, "step": 6800}, {"loss": 1.2765, "grad_norm": 1.01392662525177, "learning_rate": 0.0002, "epoch": 5.7154846831724715, "step": 6810}, {"loss": 1.3029, "grad_norm": 0.9234510064125061, "learning_rate": 0.0002, "epoch": 5.723877465379774, "step": 6820}, {"loss": 1.2891, "grad_norm": 0.9439187049865723, "learning_rate": 0.0002, "epoch": 5.732270247587075, "step": 6830}, {"loss": 1.2627, "grad_norm": 0.8833441734313965, "learning_rate": 0.0002, "epoch": 5.740663029794376, "step": 6840}, {"loss": 1.3195, "grad_norm": 0.9394439458847046, "learning_rate": 0.0002, "epoch": 5.749055812001679, "step": 6850}, {"loss": 1.3108, "grad_norm": 0.9980010390281677, "learning_rate": 0.0002, "epoch": 5.75744859420898, "step": 6860}, {"loss": 1.2958, "grad_norm": 0.9612377882003784, "learning_rate": 0.0002, "epoch": 5.765841376416282, "step": 6870}, {"loss": 1.2173, "grad_norm": 1.0817323923110962, "learning_rate": 0.0002, "epoch": 5.7742341586235835, "step": 6880}, {"loss": 1.2485, "grad_norm": 0.8445103168487549, "learning_rate": 0.0002, "epoch": 5.782626940830886, "step": 6890}, {"loss": 1.2573, "grad_norm": 0.8535459041595459, "learning_rate": 0.0002, "epoch": 5.791019723038187, "step": 6900}, {"loss": 1.2729, "grad_norm": 0.9131284356117249, "learning_rate": 0.0002, "epoch": 5.799412505245489, "step": 6910}, {"loss": 1.1934, "grad_norm": 0.8627726435661316, "learning_rate": 0.0002, "epoch": 5.807805287452791, "step": 6920}, {"loss": 1.3226, "grad_norm": 0.8599951863288879, "learning_rate": 0.0002, "epoch": 5.816198069660093, "step": 6930}, {"loss": 1.3078, "grad_norm": 1.0746861696243286, "learning_rate": 0.0002, "epoch": 5.824590851867394, "step": 6940}, {"loss": 1.2653, "grad_norm": 1.0220543146133423, "learning_rate": 0.0002, "epoch": 5.8329836340746954, "step": 6950}, {"loss": 1.3168, "grad_norm": 0.8891388177871704, "learning_rate": 0.0002, "epoch": 5.841376416281998, "step": 6960}, {"loss": 1.2845, "grad_norm": 1.1404683589935303, "learning_rate": 0.0002, "epoch": 5.849769198489299, "step": 6970}, {"loss": 1.2361, "grad_norm": 0.9665380120277405, "learning_rate": 0.0002, "epoch": 5.858161980696601, "step": 6980}, {"loss": 1.2622, "grad_norm": 0.9837968945503235, "learning_rate": 0.0002, "epoch": 5.8665547629039025, "step": 6990}, {"loss": 1.2973, "grad_norm": 1.0278598070144653, "learning_rate": 0.0002, "epoch": 5.874947545111205, "step": 7000}, {"loss": 1.2334, "grad_norm": 0.9990253448486328, "learning_rate": 0.0002, "epoch": 5.883340327318506, "step": 7010}, {"loss": 1.3508, "grad_norm": 0.9705647230148315, "learning_rate": 0.0002, "epoch": 5.891733109525807, "step": 7020}, {"loss": 1.335, "grad_norm": 0.9672252535820007, "learning_rate": 0.0002, "epoch": 5.90012589173311, "step": 7030}, {"loss": 1.2944, "grad_norm": 0.9467034339904785, "learning_rate": 0.0002, "epoch": 5.908518673940411, "step": 7040}, {"loss": 1.2704, "grad_norm": 0.9506469964981079, "learning_rate": 0.0002, "epoch": 5.916911456147713, "step": 7050}, {"loss": 1.2745, "grad_norm": 0.8936163783073425, "learning_rate": 0.0002, "epoch": 5.9253042383550145, "step": 7060}, {"loss": 1.2702, "grad_norm": 0.956101655960083, "learning_rate": 0.0002, "epoch": 5.933697020562317, "step": 7070}, {"loss": 1.2532, "grad_norm": 0.893535852432251, "learning_rate": 0.0002, "epoch": 5.942089802769618, "step": 7080}, {"loss": 1.342, "grad_norm": 1.0313799381256104, "learning_rate": 0.0002, "epoch": 5.950482584976919, "step": 7090}, {"loss": 1.3398, "grad_norm": 0.8567915558815002, "learning_rate": 0.0002, "epoch": 5.958875367184222, "step": 7100}, {"loss": 1.3127, "grad_norm": 0.9683501720428467, "learning_rate": 0.0002, "epoch": 5.967268149391523, "step": 7110}, {"loss": 1.2522, "grad_norm": 0.9401984214782715, "learning_rate": 0.0002, "epoch": 5.975660931598825, "step": 7120}, {"loss": 1.3211, "grad_norm": 1.0316764116287231, "learning_rate": 0.0002, "epoch": 5.9840537138061265, "step": 7130}, {"loss": 1.2445, "grad_norm": 0.9335392713546753, "learning_rate": 0.0002, "epoch": 5.992446496013429, "step": 7140}, {"eval_loss": 2.041194438934326, "eval_runtime": 37.9642, "eval_samples_per_second": 13.565, "eval_steps_per_second": 1.712, "epoch": 6.0, "step": 7149}, {"loss": 1.2531, "grad_norm": 1.0247591733932495, "learning_rate": 0.0002, "epoch": 6.00083927822073, "step": 7150}, {"loss": 1.1125, "grad_norm": 1.4086190462112427, "learning_rate": 0.0002, "epoch": 6.009232060428032, "step": 7160}, {"loss": 1.0702, "grad_norm": 1.0636897087097168, "learning_rate": 0.0002, "epoch": 6.017624842635334, "step": 7170}, {"loss": 1.118, "grad_norm": 1.1334257125854492, "learning_rate": 0.0002, "epoch": 6.026017624842635, "step": 7180}, {"loss": 1.0428, "grad_norm": 1.1142425537109375, "learning_rate": 0.0002, "epoch": 6.034410407049937, "step": 7190}, {"loss": 1.0439, "grad_norm": 1.1448479890823364, "learning_rate": 0.0002, "epoch": 6.0428031892572385, "step": 7200}, {"loss": 1.0364, "grad_norm": 1.181567907333374, "learning_rate": 0.0002, "epoch": 6.051195971464541, "step": 7210}, {"loss": 1.0435, "grad_norm": 1.0471529960632324, "learning_rate": 0.0002, "epoch": 6.059588753671842, "step": 7220}, {"loss": 1.0828, "grad_norm": 1.1432698965072632, "learning_rate": 0.0002, "epoch": 6.067981535879144, "step": 7230}, {"loss": 1.095, "grad_norm": 1.1316763162612915, "learning_rate": 0.0002, "epoch": 6.076374318086446, "step": 7240}, {"loss": 1.0767, "grad_norm": 0.9800271391868591, "learning_rate": 0.0002, "epoch": 6.084767100293748, "step": 7250}, {"loss": 1.0984, "grad_norm": 1.1878576278686523, "learning_rate": 0.0002, "epoch": 6.093159882501049, "step": 7260}, {"loss": 1.1225, "grad_norm": 1.0174267292022705, "learning_rate": 0.0002, "epoch": 6.1015526647083504, "step": 7270}, {"loss": 1.0747, "grad_norm": 0.9622059464454651, "learning_rate": 0.0002, "epoch": 6.109945446915653, "step": 7280}, {"loss": 1.1606, "grad_norm": 1.3247325420379639, "learning_rate": 0.0002, "epoch": 6.118338229122954, "step": 7290}, {"loss": 1.0533, "grad_norm": 1.2405189275741577, "learning_rate": 0.0002, "epoch": 6.126731011330256, "step": 7300}, {"loss": 1.1345, "grad_norm": 1.025123953819275, "learning_rate": 0.0002, "epoch": 6.1351237935375575, "step": 7310}, {"loss": 1.0879, "grad_norm": 1.2966125011444092, "learning_rate": 0.0002, "epoch": 6.14351657574486, "step": 7320}, {"loss": 1.106, "grad_norm": 1.0655252933502197, "learning_rate": 0.0002, "epoch": 6.151909357952161, "step": 7330}, {"loss": 1.1089, "grad_norm": 1.076251745223999, "learning_rate": 0.0002, "epoch": 6.160302140159462, "step": 7340}, {"loss": 1.1144, "grad_norm": 1.0632140636444092, "learning_rate": 0.0002, "epoch": 6.168694922366765, "step": 7350}, {"loss": 1.1284, "grad_norm": 1.392654538154602, "learning_rate": 0.0002, "epoch": 6.177087704574066, "step": 7360}, {"loss": 1.0909, "grad_norm": 1.071683645248413, "learning_rate": 0.0002, "epoch": 6.185480486781368, "step": 7370}, {"loss": 1.1041, "grad_norm": 1.0602295398712158, "learning_rate": 0.0002, "epoch": 6.1938732689886695, "step": 7380}, {"loss": 1.083, "grad_norm": 1.2152365446090698, "learning_rate": 0.0002, "epoch": 6.202266051195972, "step": 7390}, {"loss": 1.0622, "grad_norm": 1.1637049913406372, "learning_rate": 0.0002, "epoch": 6.210658833403273, "step": 7400}, {"loss": 1.1107, "grad_norm": 1.3976062536239624, "learning_rate": 0.0002, "epoch": 6.219051615610575, "step": 7410}, {"loss": 1.084, "grad_norm": 1.1892462968826294, "learning_rate": 0.0002, "epoch": 6.227444397817877, "step": 7420}, {"loss": 1.0517, "grad_norm": 1.23629629611969, "learning_rate": 0.0002, "epoch": 6.235837180025178, "step": 7430}, {"loss": 1.1069, "grad_norm": 1.2072324752807617, "learning_rate": 0.0002, "epoch": 6.24422996223248, "step": 7440}, {"loss": 1.172, "grad_norm": 1.2027140855789185, "learning_rate": 0.0002, "epoch": 6.2526227444397815, "step": 7450}, {"loss": 1.0373, "grad_norm": 1.2129466533660889, "learning_rate": 0.0002, "epoch": 6.261015526647084, "step": 7460}, {"loss": 1.1493, "grad_norm": 1.1675773859024048, "learning_rate": 0.0002, "epoch": 6.269408308854385, "step": 7470}, {"loss": 1.0884, "grad_norm": 1.189106822013855, "learning_rate": 0.0002, "epoch": 6.277801091061687, "step": 7480}, {"loss": 1.1557, "grad_norm": 0.9968156218528748, "learning_rate": 0.0002, "epoch": 6.286193873268989, "step": 7490}, {"loss": 1.1816, "grad_norm": 1.2140403985977173, "learning_rate": 0.0002, "epoch": 6.294586655476291, "step": 7500}, {"loss": 1.1163, "grad_norm": 1.1790717840194702, "learning_rate": 0.0002, "epoch": 6.302979437683592, "step": 7510}, {"loss": 1.114, "grad_norm": 1.1867438554763794, "learning_rate": 0.0002, "epoch": 6.3113722198908935, "step": 7520}, {"loss": 1.1697, "grad_norm": 1.2212399244308472, "learning_rate": 0.0002, "epoch": 6.319765002098196, "step": 7530}, {"loss": 1.1103, "grad_norm": 1.1840152740478516, "learning_rate": 0.0002, "epoch": 6.328157784305497, "step": 7540}, {"loss": 1.015, "grad_norm": 1.1392520666122437, "learning_rate": 0.0002, "epoch": 6.336550566512799, "step": 7550}, {"loss": 1.1686, "grad_norm": 1.2683428525924683, "learning_rate": 0.0002, "epoch": 6.344943348720101, "step": 7560}, {"loss": 1.1221, "grad_norm": 1.2927075624465942, "learning_rate": 0.0002, "epoch": 6.353336130927403, "step": 7570}, {"loss": 1.1728, "grad_norm": 1.1633557081222534, "learning_rate": 0.0002, "epoch": 6.361728913134704, "step": 7580}, {"loss": 1.0448, "grad_norm": 1.2839789390563965, "learning_rate": 0.0002, "epoch": 6.3701216953420055, "step": 7590}, {"loss": 1.0679, "grad_norm": 1.1563365459442139, "learning_rate": 0.0002, "epoch": 6.378514477549308, "step": 7600}, {"loss": 1.1222, "grad_norm": 1.3075823783874512, "learning_rate": 0.0002, "epoch": 6.386907259756609, "step": 7610}, {"loss": 1.1872, "grad_norm": 1.1148593425750732, "learning_rate": 0.0002, "epoch": 6.395300041963911, "step": 7620}, {"loss": 1.1296, "grad_norm": 1.3017758131027222, "learning_rate": 0.0002, "epoch": 6.4036928241712125, "step": 7630}, {"loss": 1.0982, "grad_norm": 1.3302847146987915, "learning_rate": 0.0002, "epoch": 6.412085606378515, "step": 7640}, {"loss": 1.1228, "grad_norm": 1.3263767957687378, "learning_rate": 0.0002, "epoch": 6.420478388585816, "step": 7650}, {"loss": 1.1036, "grad_norm": 1.2079416513442993, "learning_rate": 0.0002, "epoch": 6.428871170793118, "step": 7660}, {"loss": 1.0885, "grad_norm": 1.1282644271850586, "learning_rate": 0.0002, "epoch": 6.43726395300042, "step": 7670}, {"loss": 1.1437, "grad_norm": 1.1894482374191284, "learning_rate": 0.0002, "epoch": 6.445656735207721, "step": 7680}, {"loss": 1.1531, "grad_norm": 1.2007642984390259, "learning_rate": 0.0002, "epoch": 6.454049517415023, "step": 7690}, {"loss": 1.1639, "grad_norm": 1.3172780275344849, "learning_rate": 0.0002, "epoch": 6.4624422996223245, "step": 7700}, {"loss": 1.1477, "grad_norm": 1.113945722579956, "learning_rate": 0.0002, "epoch": 6.470835081829627, "step": 7710}, {"loss": 1.0852, "grad_norm": 1.1763832569122314, "learning_rate": 0.0002, "epoch": 6.479227864036928, "step": 7720}, {"loss": 1.1121, "grad_norm": 1.196928858757019, "learning_rate": 0.0002, "epoch": 6.48762064624423, "step": 7730}, {"loss": 1.1736, "grad_norm": 1.2109456062316895, "learning_rate": 0.0002, "epoch": 6.496013428451532, "step": 7740}, {"loss": 1.1575, "grad_norm": 1.3580254316329956, "learning_rate": 0.0002, "epoch": 6.504406210658834, "step": 7750}, {"loss": 1.0606, "grad_norm": 1.0432099103927612, "learning_rate": 0.0002, "epoch": 6.512798992866135, "step": 7760}, {"loss": 1.1453, "grad_norm": 1.0125840902328491, "learning_rate": 0.0002, "epoch": 6.5211917750734365, "step": 7770}, {"loss": 1.1112, "grad_norm": 1.5847094058990479, "learning_rate": 0.0002, "epoch": 6.529584557280739, "step": 7780}, {"loss": 1.0885, "grad_norm": 1.161391258239746, "learning_rate": 0.0002, "epoch": 6.53797733948804, "step": 7790}, {"loss": 1.1549, "grad_norm": 1.1106663942337036, "learning_rate": 0.0002, "epoch": 6.546370121695342, "step": 7800}, {"loss": 1.0584, "grad_norm": 1.2467689514160156, "learning_rate": 0.0002, "epoch": 6.554762903902644, "step": 7810}, {"loss": 1.0923, "grad_norm": 1.1907767057418823, "learning_rate": 0.0002, "epoch": 6.563155686109946, "step": 7820}, {"loss": 1.1606, "grad_norm": 1.1521105766296387, "learning_rate": 0.0002, "epoch": 6.571548468317247, "step": 7830}, {"loss": 1.1644, "grad_norm": 1.2498128414154053, "learning_rate": 0.0002, "epoch": 6.5799412505245485, "step": 7840}, {"loss": 1.0948, "grad_norm": 1.1506036520004272, "learning_rate": 0.0002, "epoch": 6.588334032731851, "step": 7850}, {"loss": 1.1499, "grad_norm": 1.118890404701233, "learning_rate": 0.0002, "epoch": 6.596726814939152, "step": 7860}, {"loss": 1.1352, "grad_norm": 1.1001442670822144, "learning_rate": 0.0002, "epoch": 6.605119597146454, "step": 7870}, {"loss": 1.1139, "grad_norm": 1.1551518440246582, "learning_rate": 0.0002, "epoch": 6.613512379353756, "step": 7880}, {"loss": 1.1255, "grad_norm": 1.1872174739837646, "learning_rate": 0.0002, "epoch": 6.621905161561058, "step": 7890}, {"loss": 1.1013, "grad_norm": 1.1665245294570923, "learning_rate": 0.0002, "epoch": 6.630297943768359, "step": 7900}, {"loss": 1.1857, "grad_norm": 1.1592308282852173, "learning_rate": 0.0002, "epoch": 6.6386907259756605, "step": 7910}, {"loss": 1.1639, "grad_norm": 1.2712409496307373, "learning_rate": 0.0002, "epoch": 6.647083508182963, "step": 7920}, {"loss": 1.147, "grad_norm": 1.0665934085845947, "learning_rate": 0.0002, "epoch": 6.655476290390264, "step": 7930}, {"loss": 1.1437, "grad_norm": 1.1843419075012207, "learning_rate": 0.0002, "epoch": 6.663869072597566, "step": 7940}, {"loss": 1.1359, "grad_norm": 1.4945712089538574, "learning_rate": 0.0002, "epoch": 6.6722618548048676, "step": 7950}, {"loss": 1.1772, "grad_norm": 1.3284149169921875, "learning_rate": 0.0002, "epoch": 6.68065463701217, "step": 7960}, {"loss": 1.1183, "grad_norm": 1.1670401096343994, "learning_rate": 0.0002, "epoch": 6.689047419219471, "step": 7970}, {"loss": 1.1808, "grad_norm": 1.1963475942611694, "learning_rate": 0.0002, "epoch": 6.697440201426773, "step": 7980}, {"loss": 1.1489, "grad_norm": 1.077380657196045, "learning_rate": 0.0002, "epoch": 6.705832983634075, "step": 7990}, {"loss": 1.1661, "grad_norm": 0.8758405447006226, "learning_rate": 0.0002, "epoch": 6.714225765841377, "step": 8000}, {"loss": 1.169, "grad_norm": 1.2686632871627808, "learning_rate": 0.0002, "epoch": 6.722618548048678, "step": 8010}, {"loss": 1.1486, "grad_norm": 1.1136665344238281, "learning_rate": 0.0002, "epoch": 6.7310113302559795, "step": 8020}, {"loss": 1.1439, "grad_norm": 1.25029456615448, "learning_rate": 0.0002, "epoch": 6.739404112463282, "step": 8030}, {"loss": 1.1121, "grad_norm": 1.0269629955291748, "learning_rate": 0.0002, "epoch": 6.747796894670583, "step": 8040}, {"loss": 1.1707, "grad_norm": 1.1515758037567139, "learning_rate": 0.0002, "epoch": 6.756189676877885, "step": 8050}, {"loss": 1.1487, "grad_norm": 1.1150308847427368, "learning_rate": 0.0002, "epoch": 6.764582459085187, "step": 8060}, {"loss": 1.088, "grad_norm": 1.025669813156128, "learning_rate": 0.0002, "epoch": 6.772975241292489, "step": 8070}, {"loss": 1.1002, "grad_norm": 1.0564825534820557, "learning_rate": 0.0002, "epoch": 6.78136802349979, "step": 8080}, {"loss": 1.1722, "grad_norm": 1.1695157289505005, "learning_rate": 0.0002, "epoch": 6.7897608057070915, "step": 8090}, {"loss": 1.1322, "grad_norm": 1.1086713075637817, "learning_rate": 0.0002, "epoch": 6.798153587914394, "step": 8100}, {"loss": 1.2036, "grad_norm": 1.0446662902832031, "learning_rate": 0.0002, "epoch": 6.806546370121695, "step": 8110}, {"loss": 1.1106, "grad_norm": 1.2017868757247925, "learning_rate": 0.0002, "epoch": 6.814939152328997, "step": 8120}, {"loss": 1.1316, "grad_norm": 1.2538378238677979, "learning_rate": 0.0002, "epoch": 6.823331934536299, "step": 8130}, {"loss": 1.1506, "grad_norm": 1.1552783250808716, "learning_rate": 0.0002, "epoch": 6.831724716743601, "step": 8140}, {"loss": 1.1623, "grad_norm": 1.2151418924331665, "learning_rate": 0.0002, "epoch": 6.840117498950902, "step": 8150}, {"loss": 1.121, "grad_norm": 1.1431301832199097, "learning_rate": 0.0002, "epoch": 6.8485102811582035, "step": 8160}, {"loss": 1.1312, "grad_norm": 1.0864715576171875, "learning_rate": 0.0002, "epoch": 6.856903063365506, "step": 8170}, {"loss": 1.1777, "grad_norm": 1.2602605819702148, "learning_rate": 0.0002, "epoch": 6.865295845572807, "step": 8180}, {"loss": 1.1237, "grad_norm": 1.1670788526535034, "learning_rate": 0.0002, "epoch": 6.873688627780109, "step": 8190}, {"loss": 1.1728, "grad_norm": 1.1444851160049438, "learning_rate": 0.0002, "epoch": 6.882081409987411, "step": 8200}, {"loss": 1.1208, "grad_norm": 1.1726973056793213, "learning_rate": 0.0002, "epoch": 6.890474192194713, "step": 8210}, {"loss": 1.1666, "grad_norm": 1.0436229705810547, "learning_rate": 0.0002, "epoch": 6.898866974402014, "step": 8220}, {"loss": 1.097, "grad_norm": 1.3296568393707275, "learning_rate": 0.0002, "epoch": 6.907259756609316, "step": 8230}, {"loss": 1.0581, "grad_norm": 1.2561821937561035, "learning_rate": 0.0002, "epoch": 6.915652538816618, "step": 8240}, {"loss": 1.2125, "grad_norm": 1.2071776390075684, "learning_rate": 0.0002, "epoch": 6.92404532102392, "step": 8250}, {"loss": 1.1433, "grad_norm": 1.115523099899292, "learning_rate": 0.0002, "epoch": 6.932438103231221, "step": 8260}, {"loss": 1.2104, "grad_norm": 1.145468831062317, "learning_rate": 0.0002, "epoch": 6.940830885438523, "step": 8270}, {"loss": 1.1654, "grad_norm": 1.2517759799957275, "learning_rate": 0.0002, "epoch": 6.949223667645825, "step": 8280}, {"loss": 1.0968, "grad_norm": 1.1757365465164185, "learning_rate": 0.0002, "epoch": 6.957616449853126, "step": 8290}, {"loss": 1.1899, "grad_norm": 1.0645636320114136, "learning_rate": 0.0002, "epoch": 6.966009232060428, "step": 8300}, {"loss": 1.2665, "grad_norm": 1.2390278577804565, "learning_rate": 0.0002, "epoch": 6.97440201426773, "step": 8310}, {"loss": 1.1491, "grad_norm": 1.202418327331543, "learning_rate": 0.0002, "epoch": 6.982794796475032, "step": 8320}, {"loss": 1.1722, "grad_norm": 1.0840344429016113, "learning_rate": 0.0002, "epoch": 6.991187578682333, "step": 8330}, {"loss": 1.1172, "grad_norm": 1.2504760026931763, "learning_rate": 0.0002, "epoch": 6.9995803608896345, "step": 8340}, {"eval_loss": 2.1729838848114014, "eval_runtime": 37.9703, "eval_samples_per_second": 13.563, "eval_steps_per_second": 1.712, "epoch": 6.9995803608896345, "step": 8340}, {"loss": 0.9518, "grad_norm": 1.3072566986083984, "learning_rate": 0.0002, "epoch": 7.007973143096937, "step": 8350}, {"loss": 0.9095, "grad_norm": 1.4257196187973022, "learning_rate": 0.0002, "epoch": 7.016365925304238, "step": 8360}, {"loss": 0.96, "grad_norm": 1.2966243028640747, "learning_rate": 0.0002, "epoch": 7.02475870751154, "step": 8370}, {"loss": 0.992, "grad_norm": 1.3083164691925049, "learning_rate": 0.0002, "epoch": 7.033151489718842, "step": 8380}, {"loss": 0.9083, "grad_norm": 1.2210543155670166, "learning_rate": 0.0002, "epoch": 7.041544271926144, "step": 8390}, {"loss": 0.9794, "grad_norm": 1.1458159685134888, "learning_rate": 0.0002, "epoch": 7.049937054133445, "step": 8400}, {"loss": 0.9451, "grad_norm": 1.4605761766433716, "learning_rate": 0.0002, "epoch": 7.0583298363407465, "step": 8410}, {"loss": 0.929, "grad_norm": 1.435689091682434, "learning_rate": 0.0002, "epoch": 7.066722618548049, "step": 8420}, {"loss": 0.9328, "grad_norm": 1.4071106910705566, "learning_rate": 0.0002, "epoch": 7.07511540075535, "step": 8430}, {"loss": 1.0118, "grad_norm": 1.2787632942199707, "learning_rate": 0.0002, "epoch": 7.083508182962652, "step": 8440}, {"loss": 0.8974, "grad_norm": 1.4746837615966797, "learning_rate": 0.0002, "epoch": 7.091900965169954, "step": 8450}, {"loss": 0.9022, "grad_norm": 1.5315444469451904, "learning_rate": 0.0002, "epoch": 7.100293747377256, "step": 8460}, {"loss": 0.9211, "grad_norm": 1.3477388620376587, "learning_rate": 0.0002, "epoch": 7.108686529584557, "step": 8470}, {"loss": 0.9362, "grad_norm": 1.4741411209106445, "learning_rate": 0.0002, "epoch": 7.117079311791859, "step": 8480}, {"loss": 0.981, "grad_norm": 1.4285027980804443, "learning_rate": 0.0002, "epoch": 7.125472093999161, "step": 8490}, {"loss": 0.9618, "grad_norm": 1.4621654748916626, "learning_rate": 0.0002, "epoch": 7.133864876206462, "step": 8500}, {"loss": 1.009, "grad_norm": 1.5798449516296387, "learning_rate": 0.0002, "epoch": 7.142257658413764, "step": 8510}, {"loss": 0.974, "grad_norm": 1.5122318267822266, "learning_rate": 0.0002, "epoch": 7.150650440621066, "step": 8520}, {"loss": 0.8893, "grad_norm": 1.1761255264282227, "learning_rate": 0.0002, "epoch": 7.159043222828368, "step": 8530}, {"loss": 0.9484, "grad_norm": 1.225748062133789, "learning_rate": 0.0002, "epoch": 7.167436005035669, "step": 8540}, {"loss": 0.9951, "grad_norm": 1.2034697532653809, "learning_rate": 0.0002, "epoch": 7.175828787242971, "step": 8550}, {"loss": 1.0185, "grad_norm": 1.3965253829956055, "learning_rate": 0.0002, "epoch": 7.184221569450273, "step": 8560}, {"loss": 0.9591, "grad_norm": 1.5653856992721558, "learning_rate": 0.0002, "epoch": 7.192614351657575, "step": 8570}, {"loss": 0.9621, "grad_norm": 1.132654070854187, "learning_rate": 0.0002, "epoch": 7.201007133864876, "step": 8580}, {"loss": 0.9662, "grad_norm": 1.563130497932434, "learning_rate": 0.0002, "epoch": 7.209399916072178, "step": 8590}, {"loss": 0.9575, "grad_norm": 1.4901666641235352, "learning_rate": 0.0002, "epoch": 7.21779269827948, "step": 8600}, {"loss": 0.9401, "grad_norm": 1.2369494438171387, "learning_rate": 0.0002, "epoch": 7.226185480486781, "step": 8610}, {"loss": 0.9773, "grad_norm": 1.2923214435577393, "learning_rate": 0.0002, "epoch": 7.234578262694083, "step": 8620}, {"loss": 0.9497, "grad_norm": 1.3038378953933716, "learning_rate": 0.0002, "epoch": 7.242971044901385, "step": 8630}, {"loss": 0.9361, "grad_norm": 1.4016213417053223, "learning_rate": 0.0002, "epoch": 7.251363827108687, "step": 8640}, {"loss": 1.0123, "grad_norm": 1.3319065570831299, "learning_rate": 0.0002, "epoch": 7.259756609315988, "step": 8650}, {"loss": 0.9359, "grad_norm": 1.5870885848999023, "learning_rate": 0.0002, "epoch": 7.2681493915232895, "step": 8660}, {"loss": 0.8986, "grad_norm": 1.269951581954956, "learning_rate": 0.0002, "epoch": 7.276542173730592, "step": 8670}, {"loss": 0.8962, "grad_norm": 1.6408095359802246, "learning_rate": 0.0002, "epoch": 7.284934955937893, "step": 8680}, {"loss": 1.0012, "grad_norm": 1.492431402206421, "learning_rate": 0.0002, "epoch": 7.293327738145195, "step": 8690}, {"loss": 0.9855, "grad_norm": 1.5359779596328735, "learning_rate": 0.0002, "epoch": 7.301720520352497, "step": 8700}, {"loss": 0.9732, "grad_norm": 1.3436894416809082, "learning_rate": 0.0002, "epoch": 7.310113302559799, "step": 8710}, {"loss": 1.0006, "grad_norm": 1.272531270980835, "learning_rate": 0.0002, "epoch": 7.3185060847671, "step": 8720}, {"loss": 0.9387, "grad_norm": 1.2252386808395386, "learning_rate": 0.0002, "epoch": 7.326898866974402, "step": 8730}, {"loss": 0.9543, "grad_norm": 1.7674977779388428, "learning_rate": 0.0002, "epoch": 7.335291649181704, "step": 8740}, {"loss": 0.9254, "grad_norm": 1.4869602918624878, "learning_rate": 0.0002, "epoch": 7.343684431389005, "step": 8750}, {"loss": 0.9397, "grad_norm": 1.7059985399246216, "learning_rate": 0.0002, "epoch": 7.352077213596307, "step": 8760}, {"loss": 1.0114, "grad_norm": 1.4273415803909302, "learning_rate": 0.0002, "epoch": 7.360469995803609, "step": 8770}, {"loss": 0.9991, "grad_norm": 1.5042296648025513, "learning_rate": 0.0002, "epoch": 7.368862778010911, "step": 8780}, {"loss": 0.9949, "grad_norm": 1.3052846193313599, "learning_rate": 0.0002, "epoch": 7.377255560218212, "step": 8790}, {"loss": 0.9305, "grad_norm": 1.2968711853027344, "learning_rate": 0.0002, "epoch": 7.385648342425514, "step": 8800}, {"loss": 1.0534, "grad_norm": 1.3339134454727173, "learning_rate": 0.0002, "epoch": 7.394041124632816, "step": 8810}, {"loss": 1.002, "grad_norm": 1.4598830938339233, "learning_rate": 0.0002, "epoch": 7.402433906840118, "step": 8820}, {"loss": 1.0351, "grad_norm": 1.408402442932129, "learning_rate": 0.0002, "epoch": 7.410826689047419, "step": 8830}, {"loss": 1.005, "grad_norm": 1.515499472618103, "learning_rate": 0.0002, "epoch": 7.419219471254721, "step": 8840}, {"loss": 0.9835, "grad_norm": 1.4303524494171143, "learning_rate": 0.0002, "epoch": 7.427612253462023, "step": 8850}, {"loss": 1.0585, "grad_norm": 1.2982665300369263, "learning_rate": 0.0002, "epoch": 7.436005035669324, "step": 8860}, {"loss": 0.969, "grad_norm": 1.300026774406433, "learning_rate": 0.0002, "epoch": 7.444397817876626, "step": 8870}, {"loss": 1.0461, "grad_norm": 1.4231666326522827, "learning_rate": 0.0002, "epoch": 7.452790600083928, "step": 8880}, {"loss": 1.0629, "grad_norm": 1.3485242128372192, "learning_rate": 0.0002, "epoch": 7.46118338229123, "step": 8890}, {"loss": 0.9812, "grad_norm": 1.3709967136383057, "learning_rate": 0.0002, "epoch": 7.469576164498531, "step": 8900}, {"loss": 0.9762, "grad_norm": 1.440061330795288, "learning_rate": 0.0002, "epoch": 7.477968946705833, "step": 8910}, {"loss": 0.986, "grad_norm": 1.35463547706604, "learning_rate": 0.0002, "epoch": 7.486361728913135, "step": 8920}, {"loss": 1.0676, "grad_norm": 1.4464876651763916, "learning_rate": 0.0002, "epoch": 7.494754511120436, "step": 8930}, {"loss": 0.9964, "grad_norm": 1.3082282543182373, "learning_rate": 0.0002, "epoch": 7.503147293327738, "step": 8940}, {"loss": 1.0189, "grad_norm": 1.5687413215637207, "learning_rate": 0.0002, "epoch": 7.51154007553504, "step": 8950}, {"loss": 0.9964, "grad_norm": 1.3017815351486206, "learning_rate": 0.0002, "epoch": 7.519932857742342, "step": 8960}, {"loss": 1.1089, "grad_norm": 1.3839282989501953, "learning_rate": 0.0002, "epoch": 7.528325639949643, "step": 8970}, {"loss": 1.0415, "grad_norm": 1.4667741060256958, "learning_rate": 0.0002, "epoch": 7.5367184221569445, "step": 8980}, {"loss": 0.9877, "grad_norm": 1.3954358100891113, "learning_rate": 0.0002, "epoch": 7.545111204364247, "step": 8990}, {"loss": 0.9216, "grad_norm": 1.2745059728622437, "learning_rate": 0.0002, "epoch": 7.553503986571548, "step": 9000}, {"loss": 0.9868, "grad_norm": 1.3012958765029907, "learning_rate": 0.0002, "epoch": 7.56189676877885, "step": 9010}, {"loss": 0.9691, "grad_norm": 1.4432767629623413, "learning_rate": 0.0002, "epoch": 7.570289550986152, "step": 9020}, {"loss": 1.0086, "grad_norm": 1.3510358333587646, "learning_rate": 0.0002, "epoch": 7.578682333193454, "step": 9030}, {"loss": 1.0167, "grad_norm": 1.331549048423767, "learning_rate": 0.0002, "epoch": 7.587075115400755, "step": 9040}, {"loss": 0.9904, "grad_norm": 1.4031989574432373, "learning_rate": 0.0002, "epoch": 7.595467897608057, "step": 9050}, {"loss": 0.9486, "grad_norm": 1.3684027194976807, "learning_rate": 0.0002, "epoch": 7.603860679815359, "step": 9060}, {"loss": 1.0284, "grad_norm": 1.5346373319625854, "learning_rate": 0.0002, "epoch": 7.612253462022661, "step": 9070}, {"loss": 0.9843, "grad_norm": 1.4921435117721558, "learning_rate": 0.0002, "epoch": 7.620646244229962, "step": 9080}, {"loss": 0.9853, "grad_norm": 1.3445239067077637, "learning_rate": 0.0002, "epoch": 7.629039026437264, "step": 9090}, {"loss": 1.0377, "grad_norm": 1.4929054975509644, "learning_rate": 0.0002, "epoch": 7.637431808644566, "step": 9100}, {"loss": 0.9422, "grad_norm": 1.3410874605178833, "learning_rate": 0.0002, "epoch": 7.645824590851867, "step": 9110}, {"loss": 1.0323, "grad_norm": 1.343114972114563, "learning_rate": 0.0002, "epoch": 7.654217373059169, "step": 9120}, {"loss": 0.9945, "grad_norm": 1.424418568611145, "learning_rate": 0.0002, "epoch": 7.662610155266471, "step": 9130}, {"loss": 0.9923, "grad_norm": 1.3746715784072876, "learning_rate": 0.0002, "epoch": 7.671002937473773, "step": 9140}, {"loss": 1.053, "grad_norm": 1.1734800338745117, "learning_rate": 0.0002, "epoch": 7.679395719681074, "step": 9150}, {"loss": 1.0328, "grad_norm": 1.4013954401016235, "learning_rate": 0.0002, "epoch": 7.687788501888376, "step": 9160}, {"loss": 1.0566, "grad_norm": 1.3568707704544067, "learning_rate": 0.0002, "epoch": 7.696181284095678, "step": 9170}, {"loss": 1.0157, "grad_norm": 1.3949618339538574, "learning_rate": 0.0002, "epoch": 7.704574066302979, "step": 9180}, {"loss": 1.0468, "grad_norm": 1.4103217124938965, "learning_rate": 0.0002, "epoch": 7.712966848510281, "step": 9190}, {"loss": 0.9251, "grad_norm": 1.3260635137557983, "learning_rate": 0.0002, "epoch": 7.721359630717583, "step": 9200}, {"loss": 1.035, "grad_norm": 1.316851019859314, "learning_rate": 0.0002, "epoch": 7.729752412924885, "step": 9210}, {"loss": 1.0313, "grad_norm": 1.2649954557418823, "learning_rate": 0.0002, "epoch": 7.738145195132186, "step": 9220}, {"loss": 1.0451, "grad_norm": 1.2904008626937866, "learning_rate": 0.0002, "epoch": 7.746537977339488, "step": 9230}, {"loss": 0.997, "grad_norm": 1.6231776475906372, "learning_rate": 0.0002, "epoch": 7.75493075954679, "step": 9240}, {"loss": 1.0586, "grad_norm": 1.4072569608688354, "learning_rate": 0.0002, "epoch": 7.763323541754091, "step": 9250}, {"loss": 0.982, "grad_norm": 1.4019498825073242, "learning_rate": 0.0002, "epoch": 7.771716323961393, "step": 9260}, {"loss": 1.0308, "grad_norm": 1.354575276374817, "learning_rate": 0.0002, "epoch": 7.780109106168695, "step": 9270}, {"loss": 0.9984, "grad_norm": 1.1940326690673828, "learning_rate": 0.0002, "epoch": 7.788501888375997, "step": 9280}, {"loss": 1.004, "grad_norm": 1.5169446468353271, "learning_rate": 0.0002, "epoch": 7.796894670583298, "step": 9290}, {"loss": 1.0822, "grad_norm": 1.5126844644546509, "learning_rate": 0.0002, "epoch": 7.8052874527906, "step": 9300}, {"loss": 1.0647, "grad_norm": 1.3362282514572144, "learning_rate": 0.0002, "epoch": 7.813680234997902, "step": 9310}, {"loss": 1.0294, "grad_norm": 1.505102515220642, "learning_rate": 0.0002, "epoch": 7.822073017205204, "step": 9320}, {"loss": 1.0402, "grad_norm": 1.3281409740447998, "learning_rate": 0.0002, "epoch": 7.830465799412505, "step": 9330}, {"loss": 1.0316, "grad_norm": 1.6044951677322388, "learning_rate": 0.0002, "epoch": 7.838858581619807, "step": 9340}, {"loss": 1.0579, "grad_norm": 1.4066485166549683, "learning_rate": 0.0002, "epoch": 7.847251363827109, "step": 9350}, {"loss": 1.0726, "grad_norm": 1.3862172365188599, "learning_rate": 0.0002, "epoch": 7.85564414603441, "step": 9360}, {"loss": 1.0363, "grad_norm": 1.6576231718063354, "learning_rate": 0.0002, "epoch": 7.864036928241712, "step": 9370}, {"loss": 1.0022, "grad_norm": 1.6516666412353516, "learning_rate": 0.0002, "epoch": 7.872429710449014, "step": 9380}, {"loss": 1.0372, "grad_norm": 1.4599813222885132, "learning_rate": 0.0002, "epoch": 7.880822492656316, "step": 9390}, {"loss": 1.0576, "grad_norm": 1.3877774477005005, "learning_rate": 0.0002, "epoch": 7.889215274863617, "step": 9400}, {"loss": 1.0389, "grad_norm": 1.3922977447509766, "learning_rate": 0.0002, "epoch": 7.897608057070919, "step": 9410}, {"loss": 1.0022, "grad_norm": 1.368686556816101, "learning_rate": 0.0002, "epoch": 7.906000839278221, "step": 9420}, {"loss": 1.0892, "grad_norm": 1.4226235151290894, "learning_rate": 0.0002, "epoch": 7.914393621485522, "step": 9430}, {"loss": 1.053, "grad_norm": 1.629234790802002, "learning_rate": 0.0002, "epoch": 7.922786403692824, "step": 9440}, {"loss": 1.0277, "grad_norm": 1.5644806623458862, "learning_rate": 0.0002, "epoch": 7.931179185900126, "step": 9450}, {"loss": 1.0567, "grad_norm": 1.1915444135665894, "learning_rate": 0.0002, "epoch": 7.939571968107428, "step": 9460}, {"loss": 1.073, "grad_norm": 1.3066319227218628, "learning_rate": 0.0002, "epoch": 7.947964750314729, "step": 9470}, {"loss": 1.0097, "grad_norm": 1.2318781614303589, "learning_rate": 0.0002, "epoch": 7.956357532522031, "step": 9480}, {"loss": 1.0836, "grad_norm": 1.558817982673645, "learning_rate": 0.0002, "epoch": 7.964750314729333, "step": 9490}, {"loss": 1.0311, "grad_norm": 1.2839301824569702, "learning_rate": 0.0002, "epoch": 7.973143096936634, "step": 9500}, {"loss": 1.0475, "grad_norm": 1.2938915491104126, "learning_rate": 0.0002, "epoch": 7.981535879143936, "step": 9510}, {"loss": 1.0254, "grad_norm": 1.4090218544006348, "learning_rate": 0.0002, "epoch": 7.989928661351238, "step": 9520}]}