diff --git a/.gitattributes b/.gitattributes index bb6d6818313615dca2d1f9334ad8e05c52e5860b..6588b867c9fbe6faa78ccb296be5b69f79cdf709 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1183,3 +1183,12 @@ gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1 gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-150-sd-10000/checkpoint-48/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-150-sd-10000/checkpoint-6/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_flare-fiqasa_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-150-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c89b2573ef09c6f37942634ae12f39b577ab7dd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc54ab35d13f4b28ffa8c99f20c0db3d91e7323f68e725af2c8a6e54d45f622 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c89b2573ef09c6f37942634ae12f39b577ab7dd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc54ab35d13f4b28ffa8c99f20c0db3d91e7323f68e725af2c8a6e54d45f622 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c075b2002ce6afd2539ed8428b7a532648a0ae59 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e872836526651f9f209f12f325c7a7806cfe788a5f0cc0ac2df9e5a02696621 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5bd59cdab5095c93135780dcf0bac2de50c123a0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bea4a8e7e67d80ee514e98886c8f16f52e932c57fa7264878754bbbd833b7a31 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5365741a7ce7a1cbae5419fe7819aa9229e9355e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffc2bf64d8fde4d5db26327467b14671c96072921161522d09bb8570cdf724a3 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8646c68cc3f87fe39fe3455731e184164984f8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/trainer_state.json @@ -0,0 +1,882 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.544075756935578e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8834b3661464eef52e70e134be888ee23bd03eeb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cfafbe2e54dce755785501eea86255490b40ae13766e36fbbb2e98ff1300300 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..39031d62f9d45e3ab5044591325a1505adb170a0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9023eb228d5ac172a597a3c52983c2e00ac0c37f24181f227fea577960b69c46 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..df39b7c7e4900ccea0235130a7962e16b10bcfc6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ea6c6af453659b278bc8dda0303baa95ac327f14e19435dd17c914be5b0461a +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d377feca24c85a2bde17fcb7872e095a67e9227 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b84ab53b8b016556e5c471248db54a7b5fb2ab750167e4f885b03aff346794 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d9b776f07cd593d883390b41c23129535ff309f4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/trainer_state.json @@ -0,0 +1,1310 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 1797, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.316113635403366e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1797/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d54d992cb9b6a4c34170c6298563e85c17672f0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7cc9fca2733a1aa9e8a24d12bfafeb33816e532d46f26d6a124e48cee37073 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5791808549569930c042b901e4d07a6ecd2824ab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e025bff57ae0383efa78511d542bdfad18c92611e898a75604fc1f02a0870d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac387c47bc376b76a4207867bc804bfbfaac9ce7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2caf9686e72f2d99753b3f12a0c4895855368b371595b7dd4d0d3e0672998126 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..87ad14a6532c10ec71312a7cececa1fe10bd248b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98fe9c6c555dae3a3bdf3813f204237057dfa5e96c9d907e021dcc9d788270e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..22053a32b8b718aace33705d42a760bb7ef2db9a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/trainer_state.json @@ -0,0 +1,1738 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 2396, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.40323764085769653, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1800 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 0.45069044828414917, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 1810 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 0.6204925775527954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 1820 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 0.5857783555984497, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 1830 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.6776524782180786, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 1840 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.5486199855804443, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1850 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.5496503710746765, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1860 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.5602648258209229, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 1870 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 1.0697380304336548, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1880 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 0.6087332367897034, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 1890 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.5112161040306091, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 1900 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 0.6393680572509766, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 1910 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 0.7201815247535706, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1920 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 0.5856018662452698, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1930 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.581247866153717, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 1940 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 0.6055102944374084, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 1950 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.546894371509552, + "learning_rate": 0.0002, + "loss": 1.5086, + "step": 1960 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 0.565558910369873, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 1970 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.2238883972167969, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 1980 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 0.6362585425376892, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 1990 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.6131124496459961, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 2000 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.5181341767311096, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2010 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 0.6667609810829163, + "learning_rate": 0.0002, + "loss": 1.5039, + "step": 2020 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 0.6488749980926514, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2030 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.5693286061286926, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 2040 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.6154143810272217, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 2050 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 0.6747981309890747, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2060 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.5494789481163025, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2070 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 2.481968402862549, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2080 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 0.589784562587738, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2090 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 0.6449820399284363, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 2100 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 0.6467038989067078, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2110 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.6533533334732056, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2120 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.6804035902023315, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2130 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 0.628773033618927, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2140 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 2150 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 0.6000894904136658, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 2170 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 0.6547419428825378, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 2180 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 0.5610318779945374, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 2190 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 0.6387564539909363, + "learning_rate": 0.0002, + "loss": 1.4814, + "step": 2200 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 0.6065090894699097, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 2210 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 0.6266646981239319, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2220 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 0.626944363117218, + "learning_rate": 0.0002, + "loss": 1.5146, + "step": 2230 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 0.6043975949287415, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 2240 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 0.599732518196106, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 0.6738389134407043, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 2260 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.5561335682868958, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 2270 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 0.6185726523399353, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2280 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.6151532530784607, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 2290 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 2300 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.6615163683891296, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2310 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 0.5832979679107666, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 2320 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 0.6119300127029419, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2330 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.6489697694778442, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2340 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 0.5539063215255737, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2350 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.6062877178192139, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2360 + }, + { + "epoch": 3.956594323873122, + "grad_norm": 0.680609941482544, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2370 + }, + { + "epoch": 3.973288814691152, + "grad_norm": 0.6176834106445312, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 2380 + }, + { + "epoch": 3.989983305509182, + "grad_norm": 0.6538102030754089, + "learning_rate": 0.0002, + "loss": 1.4984, + "step": 2390 + }, + { + "epoch": 4.0, + "eval_loss": 1.8920671939849854, + "eval_runtime": 76.5227, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 0.849, + "step": 2396 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1088151513871155e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2396/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a7d3bde230c154f3208d9a9f128136a3879ba71 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d8a094f89ef511bd01eac5f5a44a62ee185fa345dabc566691af268a3ac990 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..58b074ffb7e5a118eeb4fa4aa7fabee5f3ef08ab --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca6ec7bfa01f491d5e1b42a017607bc7cf2c7f4db35ebca1f5f45b7ca8b97f85 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c24ce7f2e84f99ff4029805c486e1bb8337420f8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14624519710bb8cc1311e6cc781b5a33ed02202d402078e40f4aa5315b0e7fcc +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c821fb13f8044ca23be02f5b9fb6d14041055951 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d52afad77475f7419648c9086512c38d0f7c88c906ef4783e76885c190568b1 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..85e9171738b0d96d3b4bd67f4db2bd47a3745227 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/trainer_state.json @@ -0,0 +1,2166 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 2995, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.40323764085769653, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1800 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 0.45069044828414917, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 1810 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 0.6204925775527954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 1820 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 0.5857783555984497, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 1830 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.6776524782180786, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 1840 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.5486199855804443, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1850 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.5496503710746765, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1860 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.5602648258209229, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 1870 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 1.0697380304336548, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1880 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 0.6087332367897034, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 1890 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.5112161040306091, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 1900 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 0.6393680572509766, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 1910 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 0.7201815247535706, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1920 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 0.5856018662452698, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1930 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.581247866153717, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 1940 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 0.6055102944374084, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 1950 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.546894371509552, + "learning_rate": 0.0002, + "loss": 1.5086, + "step": 1960 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 0.565558910369873, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 1970 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.2238883972167969, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 1980 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 0.6362585425376892, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 1990 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.6131124496459961, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 2000 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.5181341767311096, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2010 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 0.6667609810829163, + "learning_rate": 0.0002, + "loss": 1.5039, + "step": 2020 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 0.6488749980926514, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2030 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.5693286061286926, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 2040 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.6154143810272217, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 2050 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 0.6747981309890747, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2060 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.5494789481163025, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2070 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 2.481968402862549, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2080 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 0.589784562587738, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2090 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 0.6449820399284363, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 2100 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 0.6467038989067078, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2110 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.6533533334732056, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2120 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.6804035902023315, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2130 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 0.628773033618927, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2140 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 2150 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 0.6000894904136658, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 2170 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 0.6547419428825378, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 2180 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 0.5610318779945374, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 2190 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 0.6387564539909363, + "learning_rate": 0.0002, + "loss": 1.4814, + "step": 2200 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 0.6065090894699097, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 2210 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 0.6266646981239319, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2220 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 0.626944363117218, + "learning_rate": 0.0002, + "loss": 1.5146, + "step": 2230 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 0.6043975949287415, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 2240 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 0.599732518196106, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 0.6738389134407043, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 2260 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.5561335682868958, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 2270 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 0.6185726523399353, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2280 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.6151532530784607, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 2290 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 2300 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.6615163683891296, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2310 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 0.5832979679107666, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 2320 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 0.6119300127029419, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2330 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.6489697694778442, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2340 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 0.5539063215255737, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2350 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.6062877178192139, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2360 + }, + { + "epoch": 3.956594323873122, + "grad_norm": 0.680609941482544, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2370 + }, + { + "epoch": 3.973288814691152, + "grad_norm": 0.6176834106445312, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 2380 + }, + { + "epoch": 3.989983305509182, + "grad_norm": 0.6538102030754089, + "learning_rate": 0.0002, + "loss": 1.4984, + "step": 2390 + }, + { + "epoch": 4.0, + "eval_loss": 1.8920671939849854, + "eval_runtime": 76.5227, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 0.849, + "step": 2396 + }, + { + "epoch": 4.006677796327212, + "grad_norm": 0.5683762431144714, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2400 + }, + { + "epoch": 4.023372287145242, + "grad_norm": 0.6858044862747192, + "learning_rate": 0.0002, + "loss": 1.3387, + "step": 2410 + }, + { + "epoch": 4.040066777963272, + "grad_norm": 0.7614858150482178, + "learning_rate": 0.0002, + "loss": 1.4495, + "step": 2420 + }, + { + "epoch": 4.056761268781302, + "grad_norm": 0.709412693977356, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 2430 + }, + { + "epoch": 4.073455759599332, + "grad_norm": 0.7070785760879517, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 2440 + }, + { + "epoch": 4.090150250417362, + "grad_norm": 0.8815216422080994, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2450 + }, + { + "epoch": 4.106844741235392, + "grad_norm": 0.759981632232666, + "learning_rate": 0.0002, + "loss": 1.3731, + "step": 2460 + }, + { + "epoch": 4.123539232053423, + "grad_norm": 0.6715240478515625, + "learning_rate": 0.0002, + "loss": 1.3393, + "step": 2470 + }, + { + "epoch": 4.140233722871453, + "grad_norm": 0.7503564953804016, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 2480 + }, + { + "epoch": 4.156928213689483, + "grad_norm": 0.773743748664856, + "learning_rate": 0.0002, + "loss": 1.324, + "step": 2490 + }, + { + "epoch": 4.173622704507513, + "grad_norm": 0.8850100040435791, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 2500 + }, + { + "epoch": 4.190317195325543, + "grad_norm": 0.7575962543487549, + "learning_rate": 0.0002, + "loss": 1.3183, + "step": 2510 + }, + { + "epoch": 4.207011686143573, + "grad_norm": 0.9117498397827148, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 2520 + }, + { + "epoch": 4.223706176961603, + "grad_norm": 0.7637559175491333, + "learning_rate": 0.0002, + "loss": 1.3242, + "step": 2530 + }, + { + "epoch": 4.240400667779633, + "grad_norm": 0.8178390264511108, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2540 + }, + { + "epoch": 4.257095158597663, + "grad_norm": 0.8299263119697571, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2550 + }, + { + "epoch": 4.273789649415693, + "grad_norm": 0.7238091230392456, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2560 + }, + { + "epoch": 4.290484140233723, + "grad_norm": 0.7468036413192749, + "learning_rate": 0.0002, + "loss": 1.349, + "step": 2570 + }, + { + "epoch": 4.307178631051753, + "grad_norm": 0.8012791275978088, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 2580 + }, + { + "epoch": 4.323873121869783, + "grad_norm": 0.8302484154701233, + "learning_rate": 0.0002, + "loss": 1.3723, + "step": 2590 + }, + { + "epoch": 4.340567612687813, + "grad_norm": 0.751864492893219, + "learning_rate": 0.0002, + "loss": 1.4013, + "step": 2600 + }, + { + "epoch": 4.357262103505843, + "grad_norm": 0.8025410175323486, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2610 + }, + { + "epoch": 4.373956594323873, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 2620 + }, + { + "epoch": 4.390651085141903, + "grad_norm": 0.8526890873908997, + "learning_rate": 0.0002, + "loss": 1.3721, + "step": 2630 + }, + { + "epoch": 4.407345575959933, + "grad_norm": 1.0536625385284424, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2640 + }, + { + "epoch": 4.424040066777963, + "grad_norm": 0.7223818898200989, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 2650 + }, + { + "epoch": 4.440734557595993, + "grad_norm": 0.7981253266334534, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2660 + }, + { + "epoch": 4.457429048414023, + "grad_norm": 0.7136162519454956, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2670 + }, + { + "epoch": 4.474123539232053, + "grad_norm": 0.8008312582969666, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 2680 + }, + { + "epoch": 4.490818030050083, + "grad_norm": 0.7924065589904785, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 2690 + }, + { + "epoch": 4.507512520868113, + "grad_norm": 0.8224287629127502, + "learning_rate": 0.0002, + "loss": 1.402, + "step": 2700 + }, + { + "epoch": 4.524207011686143, + "grad_norm": 0.7494375109672546, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 2710 + }, + { + "epoch": 4.540901502504173, + "grad_norm": 0.8097899556159973, + "learning_rate": 0.0002, + "loss": 1.4471, + "step": 2720 + }, + { + "epoch": 4.557595993322204, + "grad_norm": 0.7728819251060486, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2730 + }, + { + "epoch": 4.574290484140234, + "grad_norm": 0.9112362265586853, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 2740 + }, + { + "epoch": 4.590984974958264, + "grad_norm": 0.7502672076225281, + "learning_rate": 0.0002, + "loss": 1.4601, + "step": 2750 + }, + { + "epoch": 4.607679465776294, + "grad_norm": 0.8816406726837158, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 2760 + }, + { + "epoch": 4.624373956594324, + "grad_norm": 0.7117180228233337, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2770 + }, + { + "epoch": 4.641068447412354, + "grad_norm": 0.8224529027938843, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2780 + }, + { + "epoch": 4.657762938230384, + "grad_norm": 0.7625266313552856, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 2790 + }, + { + "epoch": 4.674457429048414, + "grad_norm": 0.7754318118095398, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2800 + }, + { + "epoch": 4.691151919866444, + "grad_norm": 0.7907336354255676, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 2810 + }, + { + "epoch": 4.707846410684474, + "grad_norm": 0.7377734780311584, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2820 + }, + { + "epoch": 4.724540901502504, + "grad_norm": 0.7380456328392029, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 2830 + }, + { + "epoch": 4.741235392320534, + "grad_norm": 0.7148023247718811, + "learning_rate": 0.0002, + "loss": 1.4405, + "step": 2840 + }, + { + "epoch": 4.757929883138564, + "grad_norm": 0.807048499584198, + "learning_rate": 0.0002, + "loss": 1.4025, + "step": 2850 + }, + { + "epoch": 4.774624373956595, + "grad_norm": 0.8444154858589172, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 2860 + }, + { + "epoch": 4.791318864774624, + "grad_norm": 0.8328704237937927, + "learning_rate": 0.0002, + "loss": 1.4282, + "step": 2870 + }, + { + "epoch": 4.808013355592655, + "grad_norm": 0.89827960729599, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 2880 + }, + { + "epoch": 4.824707846410685, + "grad_norm": 0.7848225831985474, + "learning_rate": 0.0002, + "loss": 1.4488, + "step": 2890 + }, + { + "epoch": 4.841402337228715, + "grad_norm": 0.703802227973938, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 2900 + }, + { + "epoch": 4.858096828046745, + "grad_norm": 0.8092581629753113, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 2910 + }, + { + "epoch": 4.874791318864775, + "grad_norm": 0.7537722587585449, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 2920 + }, + { + "epoch": 4.891485809682805, + "grad_norm": 0.7966470122337341, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 2930 + }, + { + "epoch": 4.908180300500835, + "grad_norm": 0.7860329747200012, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 2940 + }, + { + "epoch": 4.924874791318865, + "grad_norm": 0.7964439988136292, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 2950 + }, + { + "epoch": 4.941569282136895, + "grad_norm": 0.740288257598877, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 2960 + }, + { + "epoch": 4.958263772954925, + "grad_norm": 0.7377685904502869, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 2970 + }, + { + "epoch": 4.974958263772955, + "grad_norm": 0.793484628200531, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2980 + }, + { + "epoch": 4.9916527545909855, + "grad_norm": 0.7710573077201843, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 2990 + }, + { + "epoch": 5.0, + "eval_loss": 1.9764225482940674, + "eval_runtime": 87.968, + "eval_samples_per_second": 5.854, + "eval_steps_per_second": 0.739, + "step": 2995 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3860189392338944e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-2995/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b382dd3843f35651348b00183e8522b985d8f81 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e29bd99a49268bf0c1f3991666aa09fc664161709985be9830a46e30e94eb05 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c654c9526d5d6ca84e040fa99b78dcaee092e2a8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c67f2b62f799001c9ccbb049f3bc911e5a6fc5eb297a905255d0bad2fe83ab83 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd3f1f1c85f9aa65023addff0b23217337095799 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dacd8927bd65e13e10a1f5129cfe8456775541599915935bdbed140236909458 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..220f3c7c321b915db101ad99ab595568db23bbff --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c8c1b8a42fc8171360d9478d8960bcef0c1e4a71f2faaca0ffa0bc1d91ec249 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..327767bd2ad2c03ac0141272b1c83928a15850d4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/trainer_state.json @@ -0,0 +1,2594 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 3594, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.40323764085769653, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1800 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 0.45069044828414917, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 1810 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 0.6204925775527954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 1820 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 0.5857783555984497, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 1830 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.6776524782180786, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 1840 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.5486199855804443, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1850 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.5496503710746765, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1860 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.5602648258209229, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 1870 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 1.0697380304336548, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1880 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 0.6087332367897034, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 1890 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.5112161040306091, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 1900 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 0.6393680572509766, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 1910 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 0.7201815247535706, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1920 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 0.5856018662452698, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1930 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.581247866153717, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 1940 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 0.6055102944374084, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 1950 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.546894371509552, + "learning_rate": 0.0002, + "loss": 1.5086, + "step": 1960 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 0.565558910369873, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 1970 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.2238883972167969, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 1980 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 0.6362585425376892, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 1990 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.6131124496459961, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 2000 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.5181341767311096, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2010 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 0.6667609810829163, + "learning_rate": 0.0002, + "loss": 1.5039, + "step": 2020 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 0.6488749980926514, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2030 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.5693286061286926, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 2040 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.6154143810272217, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 2050 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 0.6747981309890747, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2060 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.5494789481163025, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2070 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 2.481968402862549, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2080 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 0.589784562587738, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2090 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 0.6449820399284363, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 2100 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 0.6467038989067078, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2110 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.6533533334732056, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2120 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.6804035902023315, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2130 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 0.628773033618927, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2140 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 2150 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 0.6000894904136658, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 2170 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 0.6547419428825378, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 2180 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 0.5610318779945374, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 2190 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 0.6387564539909363, + "learning_rate": 0.0002, + "loss": 1.4814, + "step": 2200 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 0.6065090894699097, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 2210 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 0.6266646981239319, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2220 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 0.626944363117218, + "learning_rate": 0.0002, + "loss": 1.5146, + "step": 2230 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 0.6043975949287415, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 2240 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 0.599732518196106, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 0.6738389134407043, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 2260 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.5561335682868958, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 2270 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 0.6185726523399353, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2280 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.6151532530784607, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 2290 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 2300 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.6615163683891296, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2310 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 0.5832979679107666, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 2320 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 0.6119300127029419, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2330 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.6489697694778442, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2340 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 0.5539063215255737, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2350 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.6062877178192139, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2360 + }, + { + "epoch": 3.956594323873122, + "grad_norm": 0.680609941482544, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2370 + }, + { + "epoch": 3.973288814691152, + "grad_norm": 0.6176834106445312, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 2380 + }, + { + "epoch": 3.989983305509182, + "grad_norm": 0.6538102030754089, + "learning_rate": 0.0002, + "loss": 1.4984, + "step": 2390 + }, + { + "epoch": 4.0, + "eval_loss": 1.8920671939849854, + "eval_runtime": 76.5227, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 0.849, + "step": 2396 + }, + { + "epoch": 4.006677796327212, + "grad_norm": 0.5683762431144714, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2400 + }, + { + "epoch": 4.023372287145242, + "grad_norm": 0.6858044862747192, + "learning_rate": 0.0002, + "loss": 1.3387, + "step": 2410 + }, + { + "epoch": 4.040066777963272, + "grad_norm": 0.7614858150482178, + "learning_rate": 0.0002, + "loss": 1.4495, + "step": 2420 + }, + { + "epoch": 4.056761268781302, + "grad_norm": 0.709412693977356, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 2430 + }, + { + "epoch": 4.073455759599332, + "grad_norm": 0.7070785760879517, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 2440 + }, + { + "epoch": 4.090150250417362, + "grad_norm": 0.8815216422080994, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2450 + }, + { + "epoch": 4.106844741235392, + "grad_norm": 0.759981632232666, + "learning_rate": 0.0002, + "loss": 1.3731, + "step": 2460 + }, + { + "epoch": 4.123539232053423, + "grad_norm": 0.6715240478515625, + "learning_rate": 0.0002, + "loss": 1.3393, + "step": 2470 + }, + { + "epoch": 4.140233722871453, + "grad_norm": 0.7503564953804016, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 2480 + }, + { + "epoch": 4.156928213689483, + "grad_norm": 0.773743748664856, + "learning_rate": 0.0002, + "loss": 1.324, + "step": 2490 + }, + { + "epoch": 4.173622704507513, + "grad_norm": 0.8850100040435791, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 2500 + }, + { + "epoch": 4.190317195325543, + "grad_norm": 0.7575962543487549, + "learning_rate": 0.0002, + "loss": 1.3183, + "step": 2510 + }, + { + "epoch": 4.207011686143573, + "grad_norm": 0.9117498397827148, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 2520 + }, + { + "epoch": 4.223706176961603, + "grad_norm": 0.7637559175491333, + "learning_rate": 0.0002, + "loss": 1.3242, + "step": 2530 + }, + { + "epoch": 4.240400667779633, + "grad_norm": 0.8178390264511108, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2540 + }, + { + "epoch": 4.257095158597663, + "grad_norm": 0.8299263119697571, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2550 + }, + { + "epoch": 4.273789649415693, + "grad_norm": 0.7238091230392456, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2560 + }, + { + "epoch": 4.290484140233723, + "grad_norm": 0.7468036413192749, + "learning_rate": 0.0002, + "loss": 1.349, + "step": 2570 + }, + { + "epoch": 4.307178631051753, + "grad_norm": 0.8012791275978088, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 2580 + }, + { + "epoch": 4.323873121869783, + "grad_norm": 0.8302484154701233, + "learning_rate": 0.0002, + "loss": 1.3723, + "step": 2590 + }, + { + "epoch": 4.340567612687813, + "grad_norm": 0.751864492893219, + "learning_rate": 0.0002, + "loss": 1.4013, + "step": 2600 + }, + { + "epoch": 4.357262103505843, + "grad_norm": 0.8025410175323486, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2610 + }, + { + "epoch": 4.373956594323873, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 2620 + }, + { + "epoch": 4.390651085141903, + "grad_norm": 0.8526890873908997, + "learning_rate": 0.0002, + "loss": 1.3721, + "step": 2630 + }, + { + "epoch": 4.407345575959933, + "grad_norm": 1.0536625385284424, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2640 + }, + { + "epoch": 4.424040066777963, + "grad_norm": 0.7223818898200989, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 2650 + }, + { + "epoch": 4.440734557595993, + "grad_norm": 0.7981253266334534, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2660 + }, + { + "epoch": 4.457429048414023, + "grad_norm": 0.7136162519454956, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2670 + }, + { + "epoch": 4.474123539232053, + "grad_norm": 0.8008312582969666, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 2680 + }, + { + "epoch": 4.490818030050083, + "grad_norm": 0.7924065589904785, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 2690 + }, + { + "epoch": 4.507512520868113, + "grad_norm": 0.8224287629127502, + "learning_rate": 0.0002, + "loss": 1.402, + "step": 2700 + }, + { + "epoch": 4.524207011686143, + "grad_norm": 0.7494375109672546, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 2710 + }, + { + "epoch": 4.540901502504173, + "grad_norm": 0.8097899556159973, + "learning_rate": 0.0002, + "loss": 1.4471, + "step": 2720 + }, + { + "epoch": 4.557595993322204, + "grad_norm": 0.7728819251060486, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2730 + }, + { + "epoch": 4.574290484140234, + "grad_norm": 0.9112362265586853, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 2740 + }, + { + "epoch": 4.590984974958264, + "grad_norm": 0.7502672076225281, + "learning_rate": 0.0002, + "loss": 1.4601, + "step": 2750 + }, + { + "epoch": 4.607679465776294, + "grad_norm": 0.8816406726837158, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 2760 + }, + { + "epoch": 4.624373956594324, + "grad_norm": 0.7117180228233337, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2770 + }, + { + "epoch": 4.641068447412354, + "grad_norm": 0.8224529027938843, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2780 + }, + { + "epoch": 4.657762938230384, + "grad_norm": 0.7625266313552856, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 2790 + }, + { + "epoch": 4.674457429048414, + "grad_norm": 0.7754318118095398, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2800 + }, + { + "epoch": 4.691151919866444, + "grad_norm": 0.7907336354255676, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 2810 + }, + { + "epoch": 4.707846410684474, + "grad_norm": 0.7377734780311584, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2820 + }, + { + "epoch": 4.724540901502504, + "grad_norm": 0.7380456328392029, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 2830 + }, + { + "epoch": 4.741235392320534, + "grad_norm": 0.7148023247718811, + "learning_rate": 0.0002, + "loss": 1.4405, + "step": 2840 + }, + { + "epoch": 4.757929883138564, + "grad_norm": 0.807048499584198, + "learning_rate": 0.0002, + "loss": 1.4025, + "step": 2850 + }, + { + "epoch": 4.774624373956595, + "grad_norm": 0.8444154858589172, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 2860 + }, + { + "epoch": 4.791318864774624, + "grad_norm": 0.8328704237937927, + "learning_rate": 0.0002, + "loss": 1.4282, + "step": 2870 + }, + { + "epoch": 4.808013355592655, + "grad_norm": 0.89827960729599, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 2880 + }, + { + "epoch": 4.824707846410685, + "grad_norm": 0.7848225831985474, + "learning_rate": 0.0002, + "loss": 1.4488, + "step": 2890 + }, + { + "epoch": 4.841402337228715, + "grad_norm": 0.703802227973938, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 2900 + }, + { + "epoch": 4.858096828046745, + "grad_norm": 0.8092581629753113, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 2910 + }, + { + "epoch": 4.874791318864775, + "grad_norm": 0.7537722587585449, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 2920 + }, + { + "epoch": 4.891485809682805, + "grad_norm": 0.7966470122337341, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 2930 + }, + { + "epoch": 4.908180300500835, + "grad_norm": 0.7860329747200012, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 2940 + }, + { + "epoch": 4.924874791318865, + "grad_norm": 0.7964439988136292, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 2950 + }, + { + "epoch": 4.941569282136895, + "grad_norm": 0.740288257598877, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 2960 + }, + { + "epoch": 4.958263772954925, + "grad_norm": 0.7377685904502869, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 2970 + }, + { + "epoch": 4.974958263772955, + "grad_norm": 0.793484628200531, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2980 + }, + { + "epoch": 4.9916527545909855, + "grad_norm": 0.7710573077201843, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 2990 + }, + { + "epoch": 5.0, + "eval_loss": 1.9764225482940674, + "eval_runtime": 87.968, + "eval_samples_per_second": 5.854, + "eval_steps_per_second": 0.739, + "step": 2995 + }, + { + "epoch": 5.008347245409015, + "grad_norm": 0.680841326713562, + "learning_rate": 0.0002, + "loss": 1.3493, + "step": 3000 + }, + { + "epoch": 5.025041736227045, + "grad_norm": 0.8790825009346008, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 3010 + }, + { + "epoch": 5.041736227045075, + "grad_norm": 1.1519404649734497, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 3020 + }, + { + "epoch": 5.058430717863105, + "grad_norm": 1.1939337253570557, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3030 + }, + { + "epoch": 5.075125208681135, + "grad_norm": 1.1471049785614014, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 3040 + }, + { + "epoch": 5.091819699499165, + "grad_norm": 1.0808285474777222, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 3050 + }, + { + "epoch": 5.108514190317195, + "grad_norm": 1.0102492570877075, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 3060 + }, + { + "epoch": 5.125208681135225, + "grad_norm": 0.9869397282600403, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 3070 + }, + { + "epoch": 5.141903171953255, + "grad_norm": 0.9689525365829468, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3080 + }, + { + "epoch": 5.158597662771285, + "grad_norm": 0.9293769598007202, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 3090 + }, + { + "epoch": 5.175292153589315, + "grad_norm": 0.9289103150367737, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 3100 + }, + { + "epoch": 5.191986644407345, + "grad_norm": 0.9736173152923584, + "learning_rate": 0.0002, + "loss": 1.2538, + "step": 3110 + }, + { + "epoch": 5.208681135225375, + "grad_norm": 1.3144289255142212, + "learning_rate": 0.0002, + "loss": 1.2429, + "step": 3120 + }, + { + "epoch": 5.225375626043405, + "grad_norm": 0.95982825756073, + "learning_rate": 0.0002, + "loss": 1.2107, + "step": 3130 + }, + { + "epoch": 5.242070116861436, + "grad_norm": 0.903189480304718, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 3140 + }, + { + "epoch": 5.258764607679466, + "grad_norm": 1.056692123413086, + "learning_rate": 0.0002, + "loss": 1.2663, + "step": 3150 + }, + { + "epoch": 5.275459098497496, + "grad_norm": 1.1169359683990479, + "learning_rate": 0.0002, + "loss": 1.2955, + "step": 3160 + }, + { + "epoch": 5.292153589315526, + "grad_norm": 1.2178374528884888, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 3170 + }, + { + "epoch": 5.308848080133556, + "grad_norm": 0.9956373572349548, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 3180 + }, + { + "epoch": 5.325542570951586, + "grad_norm": 0.959555447101593, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 3190 + }, + { + "epoch": 5.342237061769616, + "grad_norm": 0.9343846440315247, + "learning_rate": 0.0002, + "loss": 1.1817, + "step": 3200 + }, + { + "epoch": 5.358931552587646, + "grad_norm": 0.8806524872779846, + "learning_rate": 0.0002, + "loss": 1.2033, + "step": 3210 + }, + { + "epoch": 5.375626043405676, + "grad_norm": 0.9477803111076355, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 3220 + }, + { + "epoch": 5.392320534223706, + "grad_norm": 0.9975674152374268, + "learning_rate": 0.0002, + "loss": 1.2011, + "step": 3230 + }, + { + "epoch": 5.409015025041736, + "grad_norm": 0.9650071263313293, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 3240 + }, + { + "epoch": 5.425709515859766, + "grad_norm": 1.0170838832855225, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 3250 + }, + { + "epoch": 5.442404006677796, + "grad_norm": 1.158118486404419, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 3260 + }, + { + "epoch": 5.459098497495827, + "grad_norm": 1.0228497982025146, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 3270 + }, + { + "epoch": 5.475792988313857, + "grad_norm": 1.0101768970489502, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3280 + }, + { + "epoch": 5.492487479131887, + "grad_norm": 1.0407295227050781, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 3290 + }, + { + "epoch": 5.509181969949917, + "grad_norm": 0.9337932467460632, + "learning_rate": 0.0002, + "loss": 1.2062, + "step": 3300 + }, + { + "epoch": 5.525876460767947, + "grad_norm": 1.0305527448654175, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 3310 + }, + { + "epoch": 5.542570951585977, + "grad_norm": 1.0523453950881958, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3320 + }, + { + "epoch": 5.559265442404007, + "grad_norm": 0.9707391858100891, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3330 + }, + { + "epoch": 5.575959933222037, + "grad_norm": 1.0054972171783447, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3340 + }, + { + "epoch": 5.592654424040067, + "grad_norm": 1.0393340587615967, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 3350 + }, + { + "epoch": 5.609348914858097, + "grad_norm": 1.0671277046203613, + "learning_rate": 0.0002, + "loss": 1.2328, + "step": 3360 + }, + { + "epoch": 5.626043405676127, + "grad_norm": 1.0725873708724976, + "learning_rate": 0.0002, + "loss": 1.2415, + "step": 3370 + }, + { + "epoch": 5.642737896494157, + "grad_norm": 0.9844746589660645, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 3380 + }, + { + "epoch": 5.659432387312187, + "grad_norm": 0.9659736752510071, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3390 + }, + { + "epoch": 5.676126878130217, + "grad_norm": 0.9152608513832092, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 3400 + }, + { + "epoch": 5.692821368948247, + "grad_norm": 0.9759509563446045, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 3410 + }, + { + "epoch": 5.709515859766277, + "grad_norm": 1.0662057399749756, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3420 + }, + { + "epoch": 5.726210350584307, + "grad_norm": 0.9780185222625732, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 3430 + }, + { + "epoch": 5.742904841402337, + "grad_norm": 0.9781617522239685, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 3440 + }, + { + "epoch": 5.759599332220367, + "grad_norm": 1.0790785551071167, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 3450 + }, + { + "epoch": 5.776293823038397, + "grad_norm": 1.0573410987854004, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 3460 + }, + { + "epoch": 5.792988313856427, + "grad_norm": 0.9953364729881287, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 3470 + }, + { + "epoch": 5.809682804674457, + "grad_norm": 1.0072667598724365, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 3480 + }, + { + "epoch": 5.826377295492487, + "grad_norm": 0.9312750697135925, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 3490 + }, + { + "epoch": 5.843071786310517, + "grad_norm": 1.059614896774292, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 3500 + }, + { + "epoch": 5.859766277128547, + "grad_norm": 1.2089484930038452, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3510 + }, + { + "epoch": 5.876460767946577, + "grad_norm": 1.0740607976913452, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 3520 + }, + { + "epoch": 5.893155258764608, + "grad_norm": 0.9620149731636047, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 3530 + }, + { + "epoch": 5.909849749582638, + "grad_norm": 1.0482431650161743, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 3540 + }, + { + "epoch": 5.926544240400668, + "grad_norm": 0.9137503504753113, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 3550 + }, + { + "epoch": 5.943238731218698, + "grad_norm": 1.1599403619766235, + "learning_rate": 0.0002, + "loss": 1.3066, + "step": 3560 + }, + { + "epoch": 5.959933222036728, + "grad_norm": 0.911613404750824, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 3570 + }, + { + "epoch": 5.976627712854758, + "grad_norm": 0.9120033383369446, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 3580 + }, + { + "epoch": 5.993322203672788, + "grad_norm": 1.0588736534118652, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 3590 + }, + { + "epoch": 6.0, + "eval_loss": 2.0921614170074463, + "eval_runtime": 71.974, + "eval_samples_per_second": 7.155, + "eval_steps_per_second": 0.903, + "step": 3594 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6632227270806733e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-3594/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa1183e890b18a2422358d490df9ae688e45c0f7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55dc2b5ff1f974bc9c104bcd9d00b788925adda2718448cdad64f5fef28c22a6 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..838a048cf58111b5a5b3647128f91648322471a0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:005a2ee98da58f0a4dd900737edf7e9385205230da075564dd26983c33b71671 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fae48d696c38584fa48fccba06fba569dd195273 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde90dd1d7bed44594b290d8fee1516a42aa77e35eb3167a8d03bc07429da3b3 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b61dcdac404abd0b62546007189a60dc912f1d73 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c83f31c17b6605b5db12586fc045283f30e7011b75dadb4dc4d3bf8c10827afb +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0a5c6c050018a849c44fe058a4a5d8d4239a36a2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/trainer_state.json @@ -0,0 +1,3022 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 4193, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.40323764085769653, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1800 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 0.45069044828414917, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 1810 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 0.6204925775527954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 1820 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 0.5857783555984497, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 1830 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.6776524782180786, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 1840 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.5486199855804443, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1850 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.5496503710746765, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1860 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.5602648258209229, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 1870 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 1.0697380304336548, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1880 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 0.6087332367897034, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 1890 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.5112161040306091, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 1900 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 0.6393680572509766, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 1910 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 0.7201815247535706, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1920 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 0.5856018662452698, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1930 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.581247866153717, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 1940 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 0.6055102944374084, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 1950 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.546894371509552, + "learning_rate": 0.0002, + "loss": 1.5086, + "step": 1960 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 0.565558910369873, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 1970 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.2238883972167969, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 1980 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 0.6362585425376892, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 1990 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.6131124496459961, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 2000 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.5181341767311096, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2010 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 0.6667609810829163, + "learning_rate": 0.0002, + "loss": 1.5039, + "step": 2020 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 0.6488749980926514, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2030 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.5693286061286926, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 2040 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.6154143810272217, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 2050 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 0.6747981309890747, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2060 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.5494789481163025, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2070 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 2.481968402862549, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2080 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 0.589784562587738, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2090 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 0.6449820399284363, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 2100 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 0.6467038989067078, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2110 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.6533533334732056, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2120 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.6804035902023315, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2130 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 0.628773033618927, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2140 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 2150 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 0.6000894904136658, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 2170 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 0.6547419428825378, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 2180 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 0.5610318779945374, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 2190 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 0.6387564539909363, + "learning_rate": 0.0002, + "loss": 1.4814, + "step": 2200 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 0.6065090894699097, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 2210 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 0.6266646981239319, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2220 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 0.626944363117218, + "learning_rate": 0.0002, + "loss": 1.5146, + "step": 2230 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 0.6043975949287415, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 2240 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 0.599732518196106, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 0.6738389134407043, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 2260 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.5561335682868958, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 2270 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 0.6185726523399353, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2280 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.6151532530784607, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 2290 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 2300 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.6615163683891296, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2310 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 0.5832979679107666, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 2320 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 0.6119300127029419, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2330 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.6489697694778442, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2340 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 0.5539063215255737, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2350 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.6062877178192139, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2360 + }, + { + "epoch": 3.956594323873122, + "grad_norm": 0.680609941482544, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2370 + }, + { + "epoch": 3.973288814691152, + "grad_norm": 0.6176834106445312, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 2380 + }, + { + "epoch": 3.989983305509182, + "grad_norm": 0.6538102030754089, + "learning_rate": 0.0002, + "loss": 1.4984, + "step": 2390 + }, + { + "epoch": 4.0, + "eval_loss": 1.8920671939849854, + "eval_runtime": 76.5227, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 0.849, + "step": 2396 + }, + { + "epoch": 4.006677796327212, + "grad_norm": 0.5683762431144714, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2400 + }, + { + "epoch": 4.023372287145242, + "grad_norm": 0.6858044862747192, + "learning_rate": 0.0002, + "loss": 1.3387, + "step": 2410 + }, + { + "epoch": 4.040066777963272, + "grad_norm": 0.7614858150482178, + "learning_rate": 0.0002, + "loss": 1.4495, + "step": 2420 + }, + { + "epoch": 4.056761268781302, + "grad_norm": 0.709412693977356, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 2430 + }, + { + "epoch": 4.073455759599332, + "grad_norm": 0.7070785760879517, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 2440 + }, + { + "epoch": 4.090150250417362, + "grad_norm": 0.8815216422080994, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2450 + }, + { + "epoch": 4.106844741235392, + "grad_norm": 0.759981632232666, + "learning_rate": 0.0002, + "loss": 1.3731, + "step": 2460 + }, + { + "epoch": 4.123539232053423, + "grad_norm": 0.6715240478515625, + "learning_rate": 0.0002, + "loss": 1.3393, + "step": 2470 + }, + { + "epoch": 4.140233722871453, + "grad_norm": 0.7503564953804016, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 2480 + }, + { + "epoch": 4.156928213689483, + "grad_norm": 0.773743748664856, + "learning_rate": 0.0002, + "loss": 1.324, + "step": 2490 + }, + { + "epoch": 4.173622704507513, + "grad_norm": 0.8850100040435791, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 2500 + }, + { + "epoch": 4.190317195325543, + "grad_norm": 0.7575962543487549, + "learning_rate": 0.0002, + "loss": 1.3183, + "step": 2510 + }, + { + "epoch": 4.207011686143573, + "grad_norm": 0.9117498397827148, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 2520 + }, + { + "epoch": 4.223706176961603, + "grad_norm": 0.7637559175491333, + "learning_rate": 0.0002, + "loss": 1.3242, + "step": 2530 + }, + { + "epoch": 4.240400667779633, + "grad_norm": 0.8178390264511108, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2540 + }, + { + "epoch": 4.257095158597663, + "grad_norm": 0.8299263119697571, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2550 + }, + { + "epoch": 4.273789649415693, + "grad_norm": 0.7238091230392456, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2560 + }, + { + "epoch": 4.290484140233723, + "grad_norm": 0.7468036413192749, + "learning_rate": 0.0002, + "loss": 1.349, + "step": 2570 + }, + { + "epoch": 4.307178631051753, + "grad_norm": 0.8012791275978088, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 2580 + }, + { + "epoch": 4.323873121869783, + "grad_norm": 0.8302484154701233, + "learning_rate": 0.0002, + "loss": 1.3723, + "step": 2590 + }, + { + "epoch": 4.340567612687813, + "grad_norm": 0.751864492893219, + "learning_rate": 0.0002, + "loss": 1.4013, + "step": 2600 + }, + { + "epoch": 4.357262103505843, + "grad_norm": 0.8025410175323486, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2610 + }, + { + "epoch": 4.373956594323873, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 2620 + }, + { + "epoch": 4.390651085141903, + "grad_norm": 0.8526890873908997, + "learning_rate": 0.0002, + "loss": 1.3721, + "step": 2630 + }, + { + "epoch": 4.407345575959933, + "grad_norm": 1.0536625385284424, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2640 + }, + { + "epoch": 4.424040066777963, + "grad_norm": 0.7223818898200989, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 2650 + }, + { + "epoch": 4.440734557595993, + "grad_norm": 0.7981253266334534, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2660 + }, + { + "epoch": 4.457429048414023, + "grad_norm": 0.7136162519454956, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2670 + }, + { + "epoch": 4.474123539232053, + "grad_norm": 0.8008312582969666, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 2680 + }, + { + "epoch": 4.490818030050083, + "grad_norm": 0.7924065589904785, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 2690 + }, + { + "epoch": 4.507512520868113, + "grad_norm": 0.8224287629127502, + "learning_rate": 0.0002, + "loss": 1.402, + "step": 2700 + }, + { + "epoch": 4.524207011686143, + "grad_norm": 0.7494375109672546, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 2710 + }, + { + "epoch": 4.540901502504173, + "grad_norm": 0.8097899556159973, + "learning_rate": 0.0002, + "loss": 1.4471, + "step": 2720 + }, + { + "epoch": 4.557595993322204, + "grad_norm": 0.7728819251060486, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2730 + }, + { + "epoch": 4.574290484140234, + "grad_norm": 0.9112362265586853, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 2740 + }, + { + "epoch": 4.590984974958264, + "grad_norm": 0.7502672076225281, + "learning_rate": 0.0002, + "loss": 1.4601, + "step": 2750 + }, + { + "epoch": 4.607679465776294, + "grad_norm": 0.8816406726837158, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 2760 + }, + { + "epoch": 4.624373956594324, + "grad_norm": 0.7117180228233337, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2770 + }, + { + "epoch": 4.641068447412354, + "grad_norm": 0.8224529027938843, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2780 + }, + { + "epoch": 4.657762938230384, + "grad_norm": 0.7625266313552856, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 2790 + }, + { + "epoch": 4.674457429048414, + "grad_norm": 0.7754318118095398, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2800 + }, + { + "epoch": 4.691151919866444, + "grad_norm": 0.7907336354255676, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 2810 + }, + { + "epoch": 4.707846410684474, + "grad_norm": 0.7377734780311584, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2820 + }, + { + "epoch": 4.724540901502504, + "grad_norm": 0.7380456328392029, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 2830 + }, + { + "epoch": 4.741235392320534, + "grad_norm": 0.7148023247718811, + "learning_rate": 0.0002, + "loss": 1.4405, + "step": 2840 + }, + { + "epoch": 4.757929883138564, + "grad_norm": 0.807048499584198, + "learning_rate": 0.0002, + "loss": 1.4025, + "step": 2850 + }, + { + "epoch": 4.774624373956595, + "grad_norm": 0.8444154858589172, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 2860 + }, + { + "epoch": 4.791318864774624, + "grad_norm": 0.8328704237937927, + "learning_rate": 0.0002, + "loss": 1.4282, + "step": 2870 + }, + { + "epoch": 4.808013355592655, + "grad_norm": 0.89827960729599, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 2880 + }, + { + "epoch": 4.824707846410685, + "grad_norm": 0.7848225831985474, + "learning_rate": 0.0002, + "loss": 1.4488, + "step": 2890 + }, + { + "epoch": 4.841402337228715, + "grad_norm": 0.703802227973938, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 2900 + }, + { + "epoch": 4.858096828046745, + "grad_norm": 0.8092581629753113, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 2910 + }, + { + "epoch": 4.874791318864775, + "grad_norm": 0.7537722587585449, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 2920 + }, + { + "epoch": 4.891485809682805, + "grad_norm": 0.7966470122337341, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 2930 + }, + { + "epoch": 4.908180300500835, + "grad_norm": 0.7860329747200012, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 2940 + }, + { + "epoch": 4.924874791318865, + "grad_norm": 0.7964439988136292, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 2950 + }, + { + "epoch": 4.941569282136895, + "grad_norm": 0.740288257598877, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 2960 + }, + { + "epoch": 4.958263772954925, + "grad_norm": 0.7377685904502869, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 2970 + }, + { + "epoch": 4.974958263772955, + "grad_norm": 0.793484628200531, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2980 + }, + { + "epoch": 4.9916527545909855, + "grad_norm": 0.7710573077201843, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 2990 + }, + { + "epoch": 5.0, + "eval_loss": 1.9764225482940674, + "eval_runtime": 87.968, + "eval_samples_per_second": 5.854, + "eval_steps_per_second": 0.739, + "step": 2995 + }, + { + "epoch": 5.008347245409015, + "grad_norm": 0.680841326713562, + "learning_rate": 0.0002, + "loss": 1.3493, + "step": 3000 + }, + { + "epoch": 5.025041736227045, + "grad_norm": 0.8790825009346008, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 3010 + }, + { + "epoch": 5.041736227045075, + "grad_norm": 1.1519404649734497, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 3020 + }, + { + "epoch": 5.058430717863105, + "grad_norm": 1.1939337253570557, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3030 + }, + { + "epoch": 5.075125208681135, + "grad_norm": 1.1471049785614014, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 3040 + }, + { + "epoch": 5.091819699499165, + "grad_norm": 1.0808285474777222, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 3050 + }, + { + "epoch": 5.108514190317195, + "grad_norm": 1.0102492570877075, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 3060 + }, + { + "epoch": 5.125208681135225, + "grad_norm": 0.9869397282600403, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 3070 + }, + { + "epoch": 5.141903171953255, + "grad_norm": 0.9689525365829468, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3080 + }, + { + "epoch": 5.158597662771285, + "grad_norm": 0.9293769598007202, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 3090 + }, + { + "epoch": 5.175292153589315, + "grad_norm": 0.9289103150367737, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 3100 + }, + { + "epoch": 5.191986644407345, + "grad_norm": 0.9736173152923584, + "learning_rate": 0.0002, + "loss": 1.2538, + "step": 3110 + }, + { + "epoch": 5.208681135225375, + "grad_norm": 1.3144289255142212, + "learning_rate": 0.0002, + "loss": 1.2429, + "step": 3120 + }, + { + "epoch": 5.225375626043405, + "grad_norm": 0.95982825756073, + "learning_rate": 0.0002, + "loss": 1.2107, + "step": 3130 + }, + { + "epoch": 5.242070116861436, + "grad_norm": 0.903189480304718, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 3140 + }, + { + "epoch": 5.258764607679466, + "grad_norm": 1.056692123413086, + "learning_rate": 0.0002, + "loss": 1.2663, + "step": 3150 + }, + { + "epoch": 5.275459098497496, + "grad_norm": 1.1169359683990479, + "learning_rate": 0.0002, + "loss": 1.2955, + "step": 3160 + }, + { + "epoch": 5.292153589315526, + "grad_norm": 1.2178374528884888, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 3170 + }, + { + "epoch": 5.308848080133556, + "grad_norm": 0.9956373572349548, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 3180 + }, + { + "epoch": 5.325542570951586, + "grad_norm": 0.959555447101593, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 3190 + }, + { + "epoch": 5.342237061769616, + "grad_norm": 0.9343846440315247, + "learning_rate": 0.0002, + "loss": 1.1817, + "step": 3200 + }, + { + "epoch": 5.358931552587646, + "grad_norm": 0.8806524872779846, + "learning_rate": 0.0002, + "loss": 1.2033, + "step": 3210 + }, + { + "epoch": 5.375626043405676, + "grad_norm": 0.9477803111076355, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 3220 + }, + { + "epoch": 5.392320534223706, + "grad_norm": 0.9975674152374268, + "learning_rate": 0.0002, + "loss": 1.2011, + "step": 3230 + }, + { + "epoch": 5.409015025041736, + "grad_norm": 0.9650071263313293, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 3240 + }, + { + "epoch": 5.425709515859766, + "grad_norm": 1.0170838832855225, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 3250 + }, + { + "epoch": 5.442404006677796, + "grad_norm": 1.158118486404419, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 3260 + }, + { + "epoch": 5.459098497495827, + "grad_norm": 1.0228497982025146, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 3270 + }, + { + "epoch": 5.475792988313857, + "grad_norm": 1.0101768970489502, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3280 + }, + { + "epoch": 5.492487479131887, + "grad_norm": 1.0407295227050781, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 3290 + }, + { + "epoch": 5.509181969949917, + "grad_norm": 0.9337932467460632, + "learning_rate": 0.0002, + "loss": 1.2062, + "step": 3300 + }, + { + "epoch": 5.525876460767947, + "grad_norm": 1.0305527448654175, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 3310 + }, + { + "epoch": 5.542570951585977, + "grad_norm": 1.0523453950881958, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3320 + }, + { + "epoch": 5.559265442404007, + "grad_norm": 0.9707391858100891, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3330 + }, + { + "epoch": 5.575959933222037, + "grad_norm": 1.0054972171783447, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3340 + }, + { + "epoch": 5.592654424040067, + "grad_norm": 1.0393340587615967, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 3350 + }, + { + "epoch": 5.609348914858097, + "grad_norm": 1.0671277046203613, + "learning_rate": 0.0002, + "loss": 1.2328, + "step": 3360 + }, + { + "epoch": 5.626043405676127, + "grad_norm": 1.0725873708724976, + "learning_rate": 0.0002, + "loss": 1.2415, + "step": 3370 + }, + { + "epoch": 5.642737896494157, + "grad_norm": 0.9844746589660645, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 3380 + }, + { + "epoch": 5.659432387312187, + "grad_norm": 0.9659736752510071, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3390 + }, + { + "epoch": 5.676126878130217, + "grad_norm": 0.9152608513832092, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 3400 + }, + { + "epoch": 5.692821368948247, + "grad_norm": 0.9759509563446045, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 3410 + }, + { + "epoch": 5.709515859766277, + "grad_norm": 1.0662057399749756, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3420 + }, + { + "epoch": 5.726210350584307, + "grad_norm": 0.9780185222625732, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 3430 + }, + { + "epoch": 5.742904841402337, + "grad_norm": 0.9781617522239685, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 3440 + }, + { + "epoch": 5.759599332220367, + "grad_norm": 1.0790785551071167, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 3450 + }, + { + "epoch": 5.776293823038397, + "grad_norm": 1.0573410987854004, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 3460 + }, + { + "epoch": 5.792988313856427, + "grad_norm": 0.9953364729881287, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 3470 + }, + { + "epoch": 5.809682804674457, + "grad_norm": 1.0072667598724365, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 3480 + }, + { + "epoch": 5.826377295492487, + "grad_norm": 0.9312750697135925, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 3490 + }, + { + "epoch": 5.843071786310517, + "grad_norm": 1.059614896774292, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 3500 + }, + { + "epoch": 5.859766277128547, + "grad_norm": 1.2089484930038452, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3510 + }, + { + "epoch": 5.876460767946577, + "grad_norm": 1.0740607976913452, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 3520 + }, + { + "epoch": 5.893155258764608, + "grad_norm": 0.9620149731636047, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 3530 + }, + { + "epoch": 5.909849749582638, + "grad_norm": 1.0482431650161743, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 3540 + }, + { + "epoch": 5.926544240400668, + "grad_norm": 0.9137503504753113, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 3550 + }, + { + "epoch": 5.943238731218698, + "grad_norm": 1.1599403619766235, + "learning_rate": 0.0002, + "loss": 1.3066, + "step": 3560 + }, + { + "epoch": 5.959933222036728, + "grad_norm": 0.911613404750824, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 3570 + }, + { + "epoch": 5.976627712854758, + "grad_norm": 0.9120033383369446, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 3580 + }, + { + "epoch": 5.993322203672788, + "grad_norm": 1.0588736534118652, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 3590 + }, + { + "epoch": 6.0, + "eval_loss": 2.0921614170074463, + "eval_runtime": 71.974, + "eval_samples_per_second": 7.155, + "eval_steps_per_second": 0.903, + "step": 3594 + }, + { + "epoch": 6.010016694490818, + "grad_norm": 0.9213348627090454, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 3600 + }, + { + "epoch": 6.026711185308848, + "grad_norm": 1.137640357017517, + "learning_rate": 0.0002, + "loss": 1.07, + "step": 3610 + }, + { + "epoch": 6.043405676126878, + "grad_norm": 1.200276494026184, + "learning_rate": 0.0002, + "loss": 0.9953, + "step": 3620 + }, + { + "epoch": 6.060100166944908, + "grad_norm": 1.335649013519287, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 3630 + }, + { + "epoch": 6.076794657762938, + "grad_norm": 1.1353906393051147, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 3640 + }, + { + "epoch": 6.093489148580968, + "grad_norm": 1.0406795740127563, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 3650 + }, + { + "epoch": 6.110183639398999, + "grad_norm": 1.2691017389297485, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 3660 + }, + { + "epoch": 6.126878130217029, + "grad_norm": 1.3334898948669434, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 3670 + }, + { + "epoch": 6.143572621035059, + "grad_norm": 1.1766020059585571, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 3680 + }, + { + "epoch": 6.160267111853089, + "grad_norm": 1.1079157590866089, + "learning_rate": 0.0002, + "loss": 1.0431, + "step": 3690 + }, + { + "epoch": 6.176961602671119, + "grad_norm": 1.4312299489974976, + "learning_rate": 0.0002, + "loss": 1.0395, + "step": 3700 + }, + { + "epoch": 6.193656093489149, + "grad_norm": 1.2636224031448364, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 3710 + }, + { + "epoch": 6.210350584307179, + "grad_norm": 1.1957253217697144, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 3720 + }, + { + "epoch": 6.227045075125209, + "grad_norm": 1.1044131517410278, + "learning_rate": 0.0002, + "loss": 1.0199, + "step": 3730 + }, + { + "epoch": 6.243739565943239, + "grad_norm": 1.2045193910598755, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3740 + }, + { + "epoch": 6.260434056761269, + "grad_norm": 1.0740957260131836, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 3750 + }, + { + "epoch": 6.277128547579299, + "grad_norm": 1.1548833847045898, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 3760 + }, + { + "epoch": 6.293823038397329, + "grad_norm": 1.257440209388733, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 3770 + }, + { + "epoch": 6.310517529215359, + "grad_norm": 1.1988940238952637, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 3780 + }, + { + "epoch": 6.3272120200333895, + "grad_norm": 1.1707229614257812, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 3790 + }, + { + "epoch": 6.343906510851419, + "grad_norm": 1.360107660293579, + "learning_rate": 0.0002, + "loss": 1.053, + "step": 3800 + }, + { + "epoch": 6.360601001669449, + "grad_norm": 1.249742031097412, + "learning_rate": 0.0002, + "loss": 1.0637, + "step": 3810 + }, + { + "epoch": 6.377295492487479, + "grad_norm": 1.2729560136795044, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 3820 + }, + { + "epoch": 6.393989983305509, + "grad_norm": 1.241761565208435, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 3830 + }, + { + "epoch": 6.410684474123539, + "grad_norm": 1.1892873048782349, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3840 + }, + { + "epoch": 6.427378964941569, + "grad_norm": 1.1766357421875, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 3850 + }, + { + "epoch": 6.444073455759599, + "grad_norm": 1.2642168998718262, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 3860 + }, + { + "epoch": 6.460767946577629, + "grad_norm": 1.3390182256698608, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 3870 + }, + { + "epoch": 6.477462437395659, + "grad_norm": 1.183168649673462, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 3880 + }, + { + "epoch": 6.494156928213689, + "grad_norm": 1.1458892822265625, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 3890 + }, + { + "epoch": 6.510851419031719, + "grad_norm": 1.2736095190048218, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 3900 + }, + { + "epoch": 6.527545909849749, + "grad_norm": 1.323607087135315, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 3910 + }, + { + "epoch": 6.54424040066778, + "grad_norm": 1.2177817821502686, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 3920 + }, + { + "epoch": 6.560934891485809, + "grad_norm": 1.3270750045776367, + "learning_rate": 0.0002, + "loss": 1.0333, + "step": 3930 + }, + { + "epoch": 6.57762938230384, + "grad_norm": 1.0974372625350952, + "learning_rate": 0.0002, + "loss": 1.0589, + "step": 3940 + }, + { + "epoch": 6.59432387312187, + "grad_norm": 1.3352670669555664, + "learning_rate": 0.0002, + "loss": 1.1347, + "step": 3950 + }, + { + "epoch": 6.6110183639399, + "grad_norm": 1.3174126148223877, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 3960 + }, + { + "epoch": 6.62771285475793, + "grad_norm": 1.1783626079559326, + "learning_rate": 0.0002, + "loss": 1.1697, + "step": 3970 + }, + { + "epoch": 6.64440734557596, + "grad_norm": 1.1886446475982666, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 3980 + }, + { + "epoch": 6.66110183639399, + "grad_norm": 1.2215187549591064, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 3990 + }, + { + "epoch": 6.67779632721202, + "grad_norm": 1.0320725440979004, + "learning_rate": 0.0002, + "loss": 1.1236, + "step": 4000 + }, + { + "epoch": 6.69449081803005, + "grad_norm": 1.340338110923767, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 4010 + }, + { + "epoch": 6.71118530884808, + "grad_norm": 1.1496273279190063, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 4020 + }, + { + "epoch": 6.72787979966611, + "grad_norm": 1.5720409154891968, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 4030 + }, + { + "epoch": 6.74457429048414, + "grad_norm": 1.497376799583435, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 4040 + }, + { + "epoch": 6.76126878130217, + "grad_norm": 1.1594456434249878, + "learning_rate": 0.0002, + "loss": 1.0808, + "step": 4050 + }, + { + "epoch": 6.7779632721202, + "grad_norm": 1.326546549797058, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 4060 + }, + { + "epoch": 6.794657762938231, + "grad_norm": 1.18723726272583, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 4070 + }, + { + "epoch": 6.811352253756261, + "grad_norm": 1.2974154949188232, + "learning_rate": 0.0002, + "loss": 1.1906, + "step": 4080 + }, + { + "epoch": 6.828046744574291, + "grad_norm": 1.207748532295227, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 4090 + }, + { + "epoch": 6.844741235392321, + "grad_norm": 1.2398537397384644, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 4100 + }, + { + "epoch": 6.861435726210351, + "grad_norm": 1.1657508611679077, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4110 + }, + { + "epoch": 6.878130217028381, + "grad_norm": 1.1986382007598877, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 4120 + }, + { + "epoch": 6.894824707846411, + "grad_norm": 1.407080054283142, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 4130 + }, + { + "epoch": 6.911519198664441, + "grad_norm": 1.0725297927856445, + "learning_rate": 0.0002, + "loss": 1.0515, + "step": 4140 + }, + { + "epoch": 6.928213689482471, + "grad_norm": 1.2659991979599, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 4150 + }, + { + "epoch": 6.944908180300501, + "grad_norm": 1.0579404830932617, + "learning_rate": 0.0002, + "loss": 1.1373, + "step": 4160 + }, + { + "epoch": 6.961602671118531, + "grad_norm": 1.254502296447754, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 4170 + }, + { + "epoch": 6.978297161936561, + "grad_norm": 1.2666021585464478, + "learning_rate": 0.0002, + "loss": 1.1019, + "step": 4180 + }, + { + "epoch": 6.994991652754591, + "grad_norm": 1.236793041229248, + "learning_rate": 0.0002, + "loss": 1.0675, + "step": 4190 + }, + { + "epoch": 7.0, + "eval_loss": 2.211871862411499, + "eval_runtime": 56.9215, + "eval_samples_per_second": 9.048, + "eval_steps_per_second": 1.142, + "step": 4193 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.940426514927452e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4193/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..697ac2e37dc40bb937a2d001b9ae26a5ceb5c890 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e368de0ac09a5e846d76689d63ae05dfae328aa448ca771babfc3ce2cf617069 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3a699b49f58a79260584b753e821d46975e1874 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad67d1f8585a414de24df664b84c0f8504418fe3bbd393d4a57a072bee0fd6da +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..87cac87b56c8a3ea92f4c357c7e2e4f144056649 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19540b8e52fabcb6426973067d9c965873af0c9075e5c6f66305ebf16e7f8f4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0a2e7e9fe7974bccac92720a98aa755fe90f8bb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ef13f6815ddc7667d2c2c75c6504e8a6cfe99f1186c3183c03f3dab17ffafd +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d79f95f35e0a649f6e795a4a78ac5648af5be463 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/trainer_state.json @@ -0,0 +1,3450 @@ +{ + "best_metric": 1.8182536363601685, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 4792, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + }, + { + "epoch": 1.001669449081803, + "grad_norm": 0.6550514698028564, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 600 + }, + { + "epoch": 1.018363939899833, + "grad_norm": 0.36824166774749756, + "learning_rate": 0.0002, + "loss": 1.7601, + "step": 610 + }, + { + "epoch": 1.0350584307178632, + "grad_norm": 0.34707099199295044, + "learning_rate": 0.0002, + "loss": 1.7684, + "step": 620 + }, + { + "epoch": 1.0517529215358932, + "grad_norm": 0.38599663972854614, + "learning_rate": 0.0002, + "loss": 1.7552, + "step": 630 + }, + { + "epoch": 1.0684474123539232, + "grad_norm": 0.34381693601608276, + "learning_rate": 0.0002, + "loss": 1.7396, + "step": 640 + }, + { + "epoch": 1.0851419031719534, + "grad_norm": 0.3657481372356415, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 650 + }, + { + "epoch": 1.1018363939899833, + "grad_norm": 0.3310803771018982, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 660 + }, + { + "epoch": 1.1185308848080133, + "grad_norm": 0.37122875452041626, + "learning_rate": 0.0002, + "loss": 1.7788, + "step": 670 + }, + { + "epoch": 1.1352253756260433, + "grad_norm": 0.3976633548736572, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 680 + }, + { + "epoch": 1.1519198664440735, + "grad_norm": 0.37567615509033203, + "learning_rate": 0.0002, + "loss": 1.7398, + "step": 690 + }, + { + "epoch": 1.1686143572621035, + "grad_norm": 0.3683645725250244, + "learning_rate": 0.0002, + "loss": 1.7326, + "step": 700 + }, + { + "epoch": 1.1853088480801335, + "grad_norm": 0.3862009644508362, + "learning_rate": 0.0002, + "loss": 1.8074, + "step": 710 + }, + { + "epoch": 1.2020033388981637, + "grad_norm": 0.3478439450263977, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 720 + }, + { + "epoch": 1.2186978297161937, + "grad_norm": 0.3694932162761688, + "learning_rate": 0.0002, + "loss": 1.7945, + "step": 730 + }, + { + "epoch": 1.2353923205342237, + "grad_norm": 0.3661787211894989, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 740 + }, + { + "epoch": 1.2520868113522536, + "grad_norm": 0.372951865196228, + "learning_rate": 0.0002, + "loss": 1.7561, + "step": 750 + }, + { + "epoch": 1.2687813021702838, + "grad_norm": 0.38718998432159424, + "learning_rate": 0.0002, + "loss": 1.7507, + "step": 760 + }, + { + "epoch": 1.2854757929883138, + "grad_norm": 0.37488260865211487, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 770 + }, + { + "epoch": 1.302170283806344, + "grad_norm": 0.34794917702674866, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 780 + }, + { + "epoch": 1.318864774624374, + "grad_norm": 0.3627476990222931, + "learning_rate": 0.0002, + "loss": 1.7592, + "step": 790 + }, + { + "epoch": 1.335559265442404, + "grad_norm": 0.3773096799850464, + "learning_rate": 0.0002, + "loss": 1.6837, + "step": 800 + }, + { + "epoch": 1.352253756260434, + "grad_norm": 0.36476725339889526, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 810 + }, + { + "epoch": 1.3689482470784642, + "grad_norm": 0.3767942190170288, + "learning_rate": 0.0002, + "loss": 1.7958, + "step": 820 + }, + { + "epoch": 1.3856427378964942, + "grad_norm": 0.3502795398235321, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 830 + }, + { + "epoch": 1.4023372287145242, + "grad_norm": 0.4008622169494629, + "learning_rate": 0.0002, + "loss": 1.736, + "step": 840 + }, + { + "epoch": 1.4190317195325544, + "grad_norm": 0.4029707610607147, + "learning_rate": 0.0002, + "loss": 1.7211, + "step": 850 + }, + { + "epoch": 1.4357262103505843, + "grad_norm": 0.41480565071105957, + "learning_rate": 0.0002, + "loss": 1.712, + "step": 860 + }, + { + "epoch": 1.4524207011686143, + "grad_norm": 0.4351646900177002, + "learning_rate": 0.0002, + "loss": 1.7287, + "step": 870 + }, + { + "epoch": 1.4691151919866443, + "grad_norm": 0.4053232967853546, + "learning_rate": 0.0002, + "loss": 1.8436, + "step": 880 + }, + { + "epoch": 1.4858096828046745, + "grad_norm": 0.3515186607837677, + "learning_rate": 0.0002, + "loss": 1.7669, + "step": 890 + }, + { + "epoch": 1.5025041736227045, + "grad_norm": 0.42895469069480896, + "learning_rate": 0.0002, + "loss": 1.698, + "step": 900 + }, + { + "epoch": 1.5191986644407347, + "grad_norm": 0.40897831320762634, + "learning_rate": 0.0002, + "loss": 1.7064, + "step": 910 + }, + { + "epoch": 1.5358931552587647, + "grad_norm": 0.3544739782810211, + "learning_rate": 0.0002, + "loss": 1.7539, + "step": 920 + }, + { + "epoch": 1.5525876460767947, + "grad_norm": 0.3848305642604828, + "learning_rate": 0.0002, + "loss": 1.7596, + "step": 930 + }, + { + "epoch": 1.5692821368948247, + "grad_norm": 0.36952173709869385, + "learning_rate": 0.0002, + "loss": 1.7344, + "step": 940 + }, + { + "epoch": 1.5859766277128546, + "grad_norm": 0.36505743861198425, + "learning_rate": 0.0002, + "loss": 1.7329, + "step": 950 + }, + { + "epoch": 1.6026711185308848, + "grad_norm": 0.3707764446735382, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 960 + }, + { + "epoch": 1.6193656093489148, + "grad_norm": 0.35995468497276306, + "learning_rate": 0.0002, + "loss": 1.7465, + "step": 970 + }, + { + "epoch": 1.636060100166945, + "grad_norm": 0.35458096861839294, + "learning_rate": 0.0002, + "loss": 1.7537, + "step": 980 + }, + { + "epoch": 1.652754590984975, + "grad_norm": 0.3557756841182709, + "learning_rate": 0.0002, + "loss": 1.7075, + "step": 990 + }, + { + "epoch": 1.669449081803005, + "grad_norm": 0.355899453163147, + "learning_rate": 0.0002, + "loss": 1.7307, + "step": 1000 + }, + { + "epoch": 1.686143572621035, + "grad_norm": 0.3709148168563843, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 1010 + }, + { + "epoch": 1.702838063439065, + "grad_norm": 0.3731614947319031, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1020 + }, + { + "epoch": 1.7195325542570952, + "grad_norm": 0.3639261722564697, + "learning_rate": 0.0002, + "loss": 1.741, + "step": 1030 + }, + { + "epoch": 1.7362270450751254, + "grad_norm": 0.36371079087257385, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1040 + }, + { + "epoch": 1.7529215358931554, + "grad_norm": 0.38235539197921753, + "learning_rate": 0.0002, + "loss": 1.7275, + "step": 1050 + }, + { + "epoch": 1.7696160267111853, + "grad_norm": 0.4109364151954651, + "learning_rate": 0.0002, + "loss": 1.7304, + "step": 1060 + }, + { + "epoch": 1.7863105175292153, + "grad_norm": 0.3499647378921509, + "learning_rate": 0.0002, + "loss": 1.7244, + "step": 1070 + }, + { + "epoch": 1.8030050083472453, + "grad_norm": 0.3892260193824768, + "learning_rate": 0.0002, + "loss": 1.7117, + "step": 1080 + }, + { + "epoch": 1.8196994991652755, + "grad_norm": 0.3545094132423401, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 1090 + }, + { + "epoch": 1.8363939899833055, + "grad_norm": 0.40419837832450867, + "learning_rate": 0.0002, + "loss": 1.87, + "step": 1100 + }, + { + "epoch": 1.8530884808013357, + "grad_norm": 0.38423678278923035, + "learning_rate": 0.0002, + "loss": 1.6783, + "step": 1110 + }, + { + "epoch": 1.8697829716193657, + "grad_norm": 0.378408282995224, + "learning_rate": 0.0002, + "loss": 1.7714, + "step": 1120 + }, + { + "epoch": 1.8864774624373957, + "grad_norm": 0.4071941077709198, + "learning_rate": 0.0002, + "loss": 1.7461, + "step": 1130 + }, + { + "epoch": 1.9031719532554257, + "grad_norm": 0.42363739013671875, + "learning_rate": 0.0002, + "loss": 1.7281, + "step": 1140 + }, + { + "epoch": 1.9198664440734556, + "grad_norm": 0.37373560667037964, + "learning_rate": 0.0002, + "loss": 1.819, + "step": 1150 + }, + { + "epoch": 1.9365609348914858, + "grad_norm": 0.36408767104148865, + "learning_rate": 0.0002, + "loss": 1.7847, + "step": 1160 + }, + { + "epoch": 1.9532554257095158, + "grad_norm": 0.3795453906059265, + "learning_rate": 0.0002, + "loss": 1.6725, + "step": 1170 + }, + { + "epoch": 1.969949916527546, + "grad_norm": 0.34415504336357117, + "learning_rate": 0.0002, + "loss": 1.7726, + "step": 1180 + }, + { + "epoch": 1.986644407345576, + "grad_norm": 0.3491021394729614, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1190 + }, + { + "epoch": 2.0, + "eval_loss": 1.8182536363601685, + "eval_runtime": 87.8767, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.74, + "step": 1198 + }, + { + "epoch": 2.003338898163606, + "grad_norm": 0.36758512258529663, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 1200 + }, + { + "epoch": 2.020033388981636, + "grad_norm": 0.36278557777404785, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 1210 + }, + { + "epoch": 2.036727879799666, + "grad_norm": 0.4186977744102478, + "learning_rate": 0.0002, + "loss": 1.6733, + "step": 1220 + }, + { + "epoch": 2.053422370617696, + "grad_norm": 0.3958706855773926, + "learning_rate": 0.0002, + "loss": 1.6977, + "step": 1230 + }, + { + "epoch": 2.0701168614357264, + "grad_norm": 0.43305638432502747, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 1240 + }, + { + "epoch": 2.0868113522537564, + "grad_norm": 0.4509678781032562, + "learning_rate": 0.0002, + "loss": 1.6275, + "step": 1250 + }, + { + "epoch": 2.1035058430717863, + "grad_norm": 0.4297264516353607, + "learning_rate": 0.0002, + "loss": 1.6814, + "step": 1260 + }, + { + "epoch": 2.1202003338898163, + "grad_norm": 0.4579504132270813, + "learning_rate": 0.0002, + "loss": 1.6121, + "step": 1270 + }, + { + "epoch": 2.1368948247078463, + "grad_norm": 0.4223267138004303, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 1280 + }, + { + "epoch": 2.1535893155258763, + "grad_norm": 0.41538703441619873, + "learning_rate": 0.0002, + "loss": 1.6472, + "step": 1290 + }, + { + "epoch": 2.1702838063439067, + "grad_norm": 0.4987374544143677, + "learning_rate": 0.0002, + "loss": 1.6274, + "step": 1300 + }, + { + "epoch": 2.1869782971619367, + "grad_norm": 0.45300114154815674, + "learning_rate": 0.0002, + "loss": 1.6555, + "step": 1310 + }, + { + "epoch": 2.2036727879799667, + "grad_norm": 0.4577588737010956, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1320 + }, + { + "epoch": 2.2203672787979967, + "grad_norm": 0.4110747277736664, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1330 + }, + { + "epoch": 2.2370617696160267, + "grad_norm": 0.5107163190841675, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1340 + }, + { + "epoch": 2.2537562604340566, + "grad_norm": 0.41190820932388306, + "learning_rate": 0.0002, + "loss": 1.691, + "step": 1350 + }, + { + "epoch": 2.2704507512520866, + "grad_norm": 0.47458386421203613, + "learning_rate": 0.0002, + "loss": 1.5745, + "step": 1360 + }, + { + "epoch": 2.287145242070117, + "grad_norm": 0.42136940360069275, + "learning_rate": 0.0002, + "loss": 1.6964, + "step": 1370 + }, + { + "epoch": 2.303839732888147, + "grad_norm": 0.48292383551597595, + "learning_rate": 0.0002, + "loss": 1.657, + "step": 1380 + }, + { + "epoch": 2.320534223706177, + "grad_norm": 0.4519229531288147, + "learning_rate": 0.0002, + "loss": 1.6816, + "step": 1390 + }, + { + "epoch": 2.337228714524207, + "grad_norm": 0.5860922336578369, + "learning_rate": 0.0002, + "loss": 1.6408, + "step": 1400 + }, + { + "epoch": 2.353923205342237, + "grad_norm": 0.4362313747406006, + "learning_rate": 0.0002, + "loss": 1.5551, + "step": 1410 + }, + { + "epoch": 2.370617696160267, + "grad_norm": 0.46916621923446655, + "learning_rate": 0.0002, + "loss": 1.6763, + "step": 1420 + }, + { + "epoch": 2.3873121869782974, + "grad_norm": 0.5249663591384888, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 1430 + }, + { + "epoch": 2.4040066777963274, + "grad_norm": 0.4764375388622284, + "learning_rate": 0.0002, + "loss": 1.6793, + "step": 1440 + }, + { + "epoch": 2.4207011686143574, + "grad_norm": 0.46573784947395325, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1450 + }, + { + "epoch": 2.4373956594323873, + "grad_norm": 0.44539371132850647, + "learning_rate": 0.0002, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.4540901502504173, + "grad_norm": 0.40925896167755127, + "learning_rate": 0.0002, + "loss": 1.6149, + "step": 1470 + }, + { + "epoch": 2.4707846410684473, + "grad_norm": 0.4431462287902832, + "learning_rate": 0.0002, + "loss": 1.6213, + "step": 1480 + }, + { + "epoch": 2.4874791318864773, + "grad_norm": 0.5476022362709045, + "learning_rate": 0.0002, + "loss": 1.692, + "step": 1490 + }, + { + "epoch": 2.5041736227045073, + "grad_norm": 0.44762539863586426, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 1500 + }, + { + "epoch": 2.5208681135225377, + "grad_norm": 0.5470041632652283, + "learning_rate": 0.0002, + "loss": 1.6811, + "step": 1510 + }, + { + "epoch": 2.5375626043405677, + "grad_norm": 0.4739997088909149, + "learning_rate": 0.0002, + "loss": 1.7411, + "step": 1520 + }, + { + "epoch": 2.5542570951585977, + "grad_norm": 0.47115322947502136, + "learning_rate": 0.0002, + "loss": 1.5975, + "step": 1530 + }, + { + "epoch": 2.5709515859766277, + "grad_norm": 0.49705708026885986, + "learning_rate": 0.0002, + "loss": 1.6828, + "step": 1540 + }, + { + "epoch": 2.5876460767946576, + "grad_norm": 0.5537301301956177, + "learning_rate": 0.0002, + "loss": 1.7135, + "step": 1550 + }, + { + "epoch": 2.604340567612688, + "grad_norm": 0.46930626034736633, + "learning_rate": 0.0002, + "loss": 1.5753, + "step": 1560 + }, + { + "epoch": 2.621035058430718, + "grad_norm": 0.42371469736099243, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 1570 + }, + { + "epoch": 2.637729549248748, + "grad_norm": 0.49005603790283203, + "learning_rate": 0.0002, + "loss": 1.6193, + "step": 1580 + }, + { + "epoch": 2.654424040066778, + "grad_norm": 0.4646829068660736, + "learning_rate": 0.0002, + "loss": 1.7138, + "step": 1590 + }, + { + "epoch": 2.671118530884808, + "grad_norm": 0.5091238617897034, + "learning_rate": 0.0002, + "loss": 1.6806, + "step": 1600 + }, + { + "epoch": 2.687813021702838, + "grad_norm": 0.4889985918998718, + "learning_rate": 0.0002, + "loss": 1.65, + "step": 1610 + }, + { + "epoch": 2.704507512520868, + "grad_norm": 0.5128234624862671, + "learning_rate": 0.0002, + "loss": 1.6573, + "step": 1620 + }, + { + "epoch": 2.721202003338898, + "grad_norm": 0.46999186277389526, + "learning_rate": 0.0002, + "loss": 1.6739, + "step": 1630 + }, + { + "epoch": 2.7378964941569284, + "grad_norm": 0.4949921667575836, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1640 + }, + { + "epoch": 2.7545909849749584, + "grad_norm": 0.4484370946884155, + "learning_rate": 0.0002, + "loss": 1.5958, + "step": 1650 + }, + { + "epoch": 2.7712854757929883, + "grad_norm": 0.45599570870399475, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1660 + }, + { + "epoch": 2.7879799666110183, + "grad_norm": 0.5093285441398621, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1670 + }, + { + "epoch": 2.8046744574290483, + "grad_norm": 0.44737935066223145, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1680 + }, + { + "epoch": 2.8213689482470787, + "grad_norm": 0.4374251365661621, + "learning_rate": 0.0002, + "loss": 1.6429, + "step": 1690 + }, + { + "epoch": 2.8380634390651087, + "grad_norm": 0.44765740633010864, + "learning_rate": 0.0002, + "loss": 1.5941, + "step": 1700 + }, + { + "epoch": 2.8547579298831387, + "grad_norm": 0.44685253500938416, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1710 + }, + { + "epoch": 2.8714524207011687, + "grad_norm": 0.44777143001556396, + "learning_rate": 0.0002, + "loss": 1.6301, + "step": 1720 + }, + { + "epoch": 2.8881469115191987, + "grad_norm": 0.4178132712841034, + "learning_rate": 0.0002, + "loss": 1.6473, + "step": 1730 + }, + { + "epoch": 2.9048414023372287, + "grad_norm": 0.4487852156162262, + "learning_rate": 0.0002, + "loss": 1.6491, + "step": 1740 + }, + { + "epoch": 2.9215358931552586, + "grad_norm": 0.47137337923049927, + "learning_rate": 0.0002, + "loss": 1.6616, + "step": 1750 + }, + { + "epoch": 2.9382303839732886, + "grad_norm": 0.48543235659599304, + "learning_rate": 0.0002, + "loss": 1.6767, + "step": 1760 + }, + { + "epoch": 2.9549248747913186, + "grad_norm": 0.4174182116985321, + "learning_rate": 0.0002, + "loss": 1.7055, + "step": 1770 + }, + { + "epoch": 2.971619365609349, + "grad_norm": 0.43385711312294006, + "learning_rate": 0.0002, + "loss": 1.5767, + "step": 1780 + }, + { + "epoch": 2.988313856427379, + "grad_norm": 0.474332332611084, + "learning_rate": 0.0002, + "loss": 1.6215, + "step": 1790 + }, + { + "epoch": 3.0, + "eval_loss": 1.8456445932388306, + "eval_runtime": 87.6261, + "eval_samples_per_second": 5.877, + "eval_steps_per_second": 0.742, + "step": 1797 + }, + { + "epoch": 3.005008347245409, + "grad_norm": 0.40323764085769653, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 1800 + }, + { + "epoch": 3.021702838063439, + "grad_norm": 0.45069044828414917, + "learning_rate": 0.0002, + "loss": 1.5367, + "step": 1810 + }, + { + "epoch": 3.038397328881469, + "grad_norm": 0.6204925775527954, + "learning_rate": 0.0002, + "loss": 1.5271, + "step": 1820 + }, + { + "epoch": 3.0550918196994994, + "grad_norm": 0.5857783555984497, + "learning_rate": 0.0002, + "loss": 1.5056, + "step": 1830 + }, + { + "epoch": 3.0717863105175294, + "grad_norm": 0.6776524782180786, + "learning_rate": 0.0002, + "loss": 1.5137, + "step": 1840 + }, + { + "epoch": 3.0884808013355594, + "grad_norm": 0.5486199855804443, + "learning_rate": 0.0002, + "loss": 1.5106, + "step": 1850 + }, + { + "epoch": 3.1051752921535893, + "grad_norm": 0.5496503710746765, + "learning_rate": 0.0002, + "loss": 1.414, + "step": 1860 + }, + { + "epoch": 3.1218697829716193, + "grad_norm": 0.5602648258209229, + "learning_rate": 0.0002, + "loss": 1.5181, + "step": 1870 + }, + { + "epoch": 3.1385642737896493, + "grad_norm": 1.0697380304336548, + "learning_rate": 0.0002, + "loss": 1.5406, + "step": 1880 + }, + { + "epoch": 3.1552587646076793, + "grad_norm": 0.6087332367897034, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 1890 + }, + { + "epoch": 3.1719532554257097, + "grad_norm": 0.5112161040306091, + "learning_rate": 0.0002, + "loss": 1.5219, + "step": 1900 + }, + { + "epoch": 3.1886477462437397, + "grad_norm": 0.6393680572509766, + "learning_rate": 0.0002, + "loss": 1.5139, + "step": 1910 + }, + { + "epoch": 3.2053422370617697, + "grad_norm": 0.7201815247535706, + "learning_rate": 0.0002, + "loss": 1.5337, + "step": 1920 + }, + { + "epoch": 3.2220367278797997, + "grad_norm": 0.5856018662452698, + "learning_rate": 0.0002, + "loss": 1.6055, + "step": 1930 + }, + { + "epoch": 3.2387312186978297, + "grad_norm": 0.581247866153717, + "learning_rate": 0.0002, + "loss": 1.4791, + "step": 1940 + }, + { + "epoch": 3.2554257095158596, + "grad_norm": 0.6055102944374084, + "learning_rate": 0.0002, + "loss": 1.5395, + "step": 1950 + }, + { + "epoch": 3.27212020033389, + "grad_norm": 0.546894371509552, + "learning_rate": 0.0002, + "loss": 1.5086, + "step": 1960 + }, + { + "epoch": 3.28881469115192, + "grad_norm": 0.565558910369873, + "learning_rate": 0.0002, + "loss": 1.5712, + "step": 1970 + }, + { + "epoch": 3.30550918196995, + "grad_norm": 1.2238883972167969, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 1980 + }, + { + "epoch": 3.32220367278798, + "grad_norm": 0.6362585425376892, + "learning_rate": 0.0002, + "loss": 1.4655, + "step": 1990 + }, + { + "epoch": 3.33889816360601, + "grad_norm": 0.6131124496459961, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 2000 + }, + { + "epoch": 3.35559265442404, + "grad_norm": 0.5181341767311096, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2010 + }, + { + "epoch": 3.37228714524207, + "grad_norm": 0.6667609810829163, + "learning_rate": 0.0002, + "loss": 1.5039, + "step": 2020 + }, + { + "epoch": 3.3889816360601, + "grad_norm": 0.6488749980926514, + "learning_rate": 0.0002, + "loss": 1.5814, + "step": 2030 + }, + { + "epoch": 3.4056761268781304, + "grad_norm": 0.5693286061286926, + "learning_rate": 0.0002, + "loss": 1.5226, + "step": 2040 + }, + { + "epoch": 3.4223706176961604, + "grad_norm": 0.6154143810272217, + "learning_rate": 0.0002, + "loss": 1.5121, + "step": 2050 + }, + { + "epoch": 3.4390651085141903, + "grad_norm": 0.6747981309890747, + "learning_rate": 0.0002, + "loss": 1.6033, + "step": 2060 + }, + { + "epoch": 3.4557595993322203, + "grad_norm": 0.5494789481163025, + "learning_rate": 0.0002, + "loss": 1.5857, + "step": 2070 + }, + { + "epoch": 3.4724540901502503, + "grad_norm": 2.481968402862549, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2080 + }, + { + "epoch": 3.4891485809682803, + "grad_norm": 0.589784562587738, + "learning_rate": 0.0002, + "loss": 1.4989, + "step": 2090 + }, + { + "epoch": 3.5058430717863107, + "grad_norm": 0.6449820399284363, + "learning_rate": 0.0002, + "loss": 1.6227, + "step": 2100 + }, + { + "epoch": 3.5225375626043407, + "grad_norm": 0.6467038989067078, + "learning_rate": 0.0002, + "loss": 1.588, + "step": 2110 + }, + { + "epoch": 3.5392320534223707, + "grad_norm": 0.6533533334732056, + "learning_rate": 0.0002, + "loss": 1.5655, + "step": 2120 + }, + { + "epoch": 3.5559265442404007, + "grad_norm": 0.6804035902023315, + "learning_rate": 0.0002, + "loss": 1.6052, + "step": 2130 + }, + { + "epoch": 3.5726210350584306, + "grad_norm": 0.628773033618927, + "learning_rate": 0.0002, + "loss": 1.5408, + "step": 2140 + }, + { + "epoch": 3.5893155258764606, + "grad_norm": 0.6055739521980286, + "learning_rate": 0.0002, + "loss": 1.5487, + "step": 2150 + }, + { + "epoch": 3.6060100166944906, + "grad_norm": 0.6000894904136658, + "learning_rate": 0.0002, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 3.6227045075125206, + "grad_norm": 0.5862473249435425, + "learning_rate": 0.0002, + "loss": 1.4742, + "step": 2170 + }, + { + "epoch": 3.639398998330551, + "grad_norm": 0.6547419428825378, + "learning_rate": 0.0002, + "loss": 1.503, + "step": 2180 + }, + { + "epoch": 3.656093489148581, + "grad_norm": 0.5610318779945374, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 2190 + }, + { + "epoch": 3.672787979966611, + "grad_norm": 0.6387564539909363, + "learning_rate": 0.0002, + "loss": 1.4814, + "step": 2200 + }, + { + "epoch": 3.689482470784641, + "grad_norm": 0.6065090894699097, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 2210 + }, + { + "epoch": 3.706176961602671, + "grad_norm": 0.6266646981239319, + "learning_rate": 0.0002, + "loss": 1.5074, + "step": 2220 + }, + { + "epoch": 3.7228714524207014, + "grad_norm": 0.626944363117218, + "learning_rate": 0.0002, + "loss": 1.5146, + "step": 2230 + }, + { + "epoch": 3.7395659432387314, + "grad_norm": 0.6043975949287415, + "learning_rate": 0.0002, + "loss": 1.5131, + "step": 2240 + }, + { + "epoch": 3.7562604340567614, + "grad_norm": 0.599732518196106, + "learning_rate": 0.0002, + "loss": 1.5929, + "step": 2250 + }, + { + "epoch": 3.7729549248747913, + "grad_norm": 0.6738389134407043, + "learning_rate": 0.0002, + "loss": 1.5236, + "step": 2260 + }, + { + "epoch": 3.7896494156928213, + "grad_norm": 0.5561335682868958, + "learning_rate": 0.0002, + "loss": 1.5003, + "step": 2270 + }, + { + "epoch": 3.8063439065108513, + "grad_norm": 0.6185726523399353, + "learning_rate": 0.0002, + "loss": 1.5013, + "step": 2280 + }, + { + "epoch": 3.8230383973288813, + "grad_norm": 0.6151532530784607, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 2290 + }, + { + "epoch": 3.8397328881469113, + "grad_norm": 0.5808233022689819, + "learning_rate": 0.0002, + "loss": 1.5453, + "step": 2300 + }, + { + "epoch": 3.8564273789649417, + "grad_norm": 0.6615163683891296, + "learning_rate": 0.0002, + "loss": 1.5223, + "step": 2310 + }, + { + "epoch": 3.8731218697829717, + "grad_norm": 0.5832979679107666, + "learning_rate": 0.0002, + "loss": 1.4365, + "step": 2320 + }, + { + "epoch": 3.8898163606010017, + "grad_norm": 0.6119300127029419, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2330 + }, + { + "epoch": 3.9065108514190316, + "grad_norm": 0.6489697694778442, + "learning_rate": 0.0002, + "loss": 1.5581, + "step": 2340 + }, + { + "epoch": 3.9232053422370616, + "grad_norm": 0.5539063215255737, + "learning_rate": 0.0002, + "loss": 1.5601, + "step": 2350 + }, + { + "epoch": 3.939899833055092, + "grad_norm": 0.6062877178192139, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2360 + }, + { + "epoch": 3.956594323873122, + "grad_norm": 0.680609941482544, + "learning_rate": 0.0002, + "loss": 1.5168, + "step": 2370 + }, + { + "epoch": 3.973288814691152, + "grad_norm": 0.6176834106445312, + "learning_rate": 0.0002, + "loss": 1.4875, + "step": 2380 + }, + { + "epoch": 3.989983305509182, + "grad_norm": 0.6538102030754089, + "learning_rate": 0.0002, + "loss": 1.4984, + "step": 2390 + }, + { + "epoch": 4.0, + "eval_loss": 1.8920671939849854, + "eval_runtime": 76.5227, + "eval_samples_per_second": 6.73, + "eval_steps_per_second": 0.849, + "step": 2396 + }, + { + "epoch": 4.006677796327212, + "grad_norm": 0.5683762431144714, + "learning_rate": 0.0002, + "loss": 1.3926, + "step": 2400 + }, + { + "epoch": 4.023372287145242, + "grad_norm": 0.6858044862747192, + "learning_rate": 0.0002, + "loss": 1.3387, + "step": 2410 + }, + { + "epoch": 4.040066777963272, + "grad_norm": 0.7614858150482178, + "learning_rate": 0.0002, + "loss": 1.4495, + "step": 2420 + }, + { + "epoch": 4.056761268781302, + "grad_norm": 0.709412693977356, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 2430 + }, + { + "epoch": 4.073455759599332, + "grad_norm": 0.7070785760879517, + "learning_rate": 0.0002, + "loss": 1.3836, + "step": 2440 + }, + { + "epoch": 4.090150250417362, + "grad_norm": 0.8815216422080994, + "learning_rate": 0.0002, + "loss": 1.3527, + "step": 2450 + }, + { + "epoch": 4.106844741235392, + "grad_norm": 0.759981632232666, + "learning_rate": 0.0002, + "loss": 1.3731, + "step": 2460 + }, + { + "epoch": 4.123539232053423, + "grad_norm": 0.6715240478515625, + "learning_rate": 0.0002, + "loss": 1.3393, + "step": 2470 + }, + { + "epoch": 4.140233722871453, + "grad_norm": 0.7503564953804016, + "learning_rate": 0.0002, + "loss": 1.3934, + "step": 2480 + }, + { + "epoch": 4.156928213689483, + "grad_norm": 0.773743748664856, + "learning_rate": 0.0002, + "loss": 1.324, + "step": 2490 + }, + { + "epoch": 4.173622704507513, + "grad_norm": 0.8850100040435791, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 2500 + }, + { + "epoch": 4.190317195325543, + "grad_norm": 0.7575962543487549, + "learning_rate": 0.0002, + "loss": 1.3183, + "step": 2510 + }, + { + "epoch": 4.207011686143573, + "grad_norm": 0.9117498397827148, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 2520 + }, + { + "epoch": 4.223706176961603, + "grad_norm": 0.7637559175491333, + "learning_rate": 0.0002, + "loss": 1.3242, + "step": 2530 + }, + { + "epoch": 4.240400667779633, + "grad_norm": 0.8178390264511108, + "learning_rate": 0.0002, + "loss": 1.3764, + "step": 2540 + }, + { + "epoch": 4.257095158597663, + "grad_norm": 0.8299263119697571, + "learning_rate": 0.0002, + "loss": 1.3808, + "step": 2550 + }, + { + "epoch": 4.273789649415693, + "grad_norm": 0.7238091230392456, + "learning_rate": 0.0002, + "loss": 1.3637, + "step": 2560 + }, + { + "epoch": 4.290484140233723, + "grad_norm": 0.7468036413192749, + "learning_rate": 0.0002, + "loss": 1.349, + "step": 2570 + }, + { + "epoch": 4.307178631051753, + "grad_norm": 0.8012791275978088, + "learning_rate": 0.0002, + "loss": 1.4422, + "step": 2580 + }, + { + "epoch": 4.323873121869783, + "grad_norm": 0.8302484154701233, + "learning_rate": 0.0002, + "loss": 1.3723, + "step": 2590 + }, + { + "epoch": 4.340567612687813, + "grad_norm": 0.751864492893219, + "learning_rate": 0.0002, + "loss": 1.4013, + "step": 2600 + }, + { + "epoch": 4.357262103505843, + "grad_norm": 0.8025410175323486, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 2610 + }, + { + "epoch": 4.373956594323873, + "grad_norm": 0.7700799703598022, + "learning_rate": 0.0002, + "loss": 1.3831, + "step": 2620 + }, + { + "epoch": 4.390651085141903, + "grad_norm": 0.8526890873908997, + "learning_rate": 0.0002, + "loss": 1.3721, + "step": 2630 + }, + { + "epoch": 4.407345575959933, + "grad_norm": 1.0536625385284424, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2640 + }, + { + "epoch": 4.424040066777963, + "grad_norm": 0.7223818898200989, + "learning_rate": 0.0002, + "loss": 1.3736, + "step": 2650 + }, + { + "epoch": 4.440734557595993, + "grad_norm": 0.7981253266334534, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2660 + }, + { + "epoch": 4.457429048414023, + "grad_norm": 0.7136162519454956, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 2670 + }, + { + "epoch": 4.474123539232053, + "grad_norm": 0.8008312582969666, + "learning_rate": 0.0002, + "loss": 1.4242, + "step": 2680 + }, + { + "epoch": 4.490818030050083, + "grad_norm": 0.7924065589904785, + "learning_rate": 0.0002, + "loss": 1.3448, + "step": 2690 + }, + { + "epoch": 4.507512520868113, + "grad_norm": 0.8224287629127502, + "learning_rate": 0.0002, + "loss": 1.402, + "step": 2700 + }, + { + "epoch": 4.524207011686143, + "grad_norm": 0.7494375109672546, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 2710 + }, + { + "epoch": 4.540901502504173, + "grad_norm": 0.8097899556159973, + "learning_rate": 0.0002, + "loss": 1.4471, + "step": 2720 + }, + { + "epoch": 4.557595993322204, + "grad_norm": 0.7728819251060486, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 2730 + }, + { + "epoch": 4.574290484140234, + "grad_norm": 0.9112362265586853, + "learning_rate": 0.0002, + "loss": 1.3549, + "step": 2740 + }, + { + "epoch": 4.590984974958264, + "grad_norm": 0.7502672076225281, + "learning_rate": 0.0002, + "loss": 1.4601, + "step": 2750 + }, + { + "epoch": 4.607679465776294, + "grad_norm": 0.8816406726837158, + "learning_rate": 0.0002, + "loss": 1.4216, + "step": 2760 + }, + { + "epoch": 4.624373956594324, + "grad_norm": 0.7117180228233337, + "learning_rate": 0.0002, + "loss": 1.3233, + "step": 2770 + }, + { + "epoch": 4.641068447412354, + "grad_norm": 0.8224529027938843, + "learning_rate": 0.0002, + "loss": 1.3886, + "step": 2780 + }, + { + "epoch": 4.657762938230384, + "grad_norm": 0.7625266313552856, + "learning_rate": 0.0002, + "loss": 1.3756, + "step": 2790 + }, + { + "epoch": 4.674457429048414, + "grad_norm": 0.7754318118095398, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 2800 + }, + { + "epoch": 4.691151919866444, + "grad_norm": 0.7907336354255676, + "learning_rate": 0.0002, + "loss": 1.4102, + "step": 2810 + }, + { + "epoch": 4.707846410684474, + "grad_norm": 0.7377734780311584, + "learning_rate": 0.0002, + "loss": 1.3277, + "step": 2820 + }, + { + "epoch": 4.724540901502504, + "grad_norm": 0.7380456328392029, + "learning_rate": 0.0002, + "loss": 1.3686, + "step": 2830 + }, + { + "epoch": 4.741235392320534, + "grad_norm": 0.7148023247718811, + "learning_rate": 0.0002, + "loss": 1.4405, + "step": 2840 + }, + { + "epoch": 4.757929883138564, + "grad_norm": 0.807048499584198, + "learning_rate": 0.0002, + "loss": 1.4025, + "step": 2850 + }, + { + "epoch": 4.774624373956595, + "grad_norm": 0.8444154858589172, + "learning_rate": 0.0002, + "loss": 1.3195, + "step": 2860 + }, + { + "epoch": 4.791318864774624, + "grad_norm": 0.8328704237937927, + "learning_rate": 0.0002, + "loss": 1.4282, + "step": 2870 + }, + { + "epoch": 4.808013355592655, + "grad_norm": 0.89827960729599, + "learning_rate": 0.0002, + "loss": 1.413, + "step": 2880 + }, + { + "epoch": 4.824707846410685, + "grad_norm": 0.7848225831985474, + "learning_rate": 0.0002, + "loss": 1.4488, + "step": 2890 + }, + { + "epoch": 4.841402337228715, + "grad_norm": 0.703802227973938, + "learning_rate": 0.0002, + "loss": 1.3757, + "step": 2900 + }, + { + "epoch": 4.858096828046745, + "grad_norm": 0.8092581629753113, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 2910 + }, + { + "epoch": 4.874791318864775, + "grad_norm": 0.7537722587585449, + "learning_rate": 0.0002, + "loss": 1.3812, + "step": 2920 + }, + { + "epoch": 4.891485809682805, + "grad_norm": 0.7966470122337341, + "learning_rate": 0.0002, + "loss": 1.4499, + "step": 2930 + }, + { + "epoch": 4.908180300500835, + "grad_norm": 0.7860329747200012, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 2940 + }, + { + "epoch": 4.924874791318865, + "grad_norm": 0.7964439988136292, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 2950 + }, + { + "epoch": 4.941569282136895, + "grad_norm": 0.740288257598877, + "learning_rate": 0.0002, + "loss": 1.3869, + "step": 2960 + }, + { + "epoch": 4.958263772954925, + "grad_norm": 0.7377685904502869, + "learning_rate": 0.0002, + "loss": 1.4321, + "step": 2970 + }, + { + "epoch": 4.974958263772955, + "grad_norm": 0.793484628200531, + "learning_rate": 0.0002, + "loss": 1.4253, + "step": 2980 + }, + { + "epoch": 4.9916527545909855, + "grad_norm": 0.7710573077201843, + "learning_rate": 0.0002, + "loss": 1.3966, + "step": 2990 + }, + { + "epoch": 5.0, + "eval_loss": 1.9764225482940674, + "eval_runtime": 87.968, + "eval_samples_per_second": 5.854, + "eval_steps_per_second": 0.739, + "step": 2995 + }, + { + "epoch": 5.008347245409015, + "grad_norm": 0.680841326713562, + "learning_rate": 0.0002, + "loss": 1.3493, + "step": 3000 + }, + { + "epoch": 5.025041736227045, + "grad_norm": 0.8790825009346008, + "learning_rate": 0.0002, + "loss": 1.2462, + "step": 3010 + }, + { + "epoch": 5.041736227045075, + "grad_norm": 1.1519404649734497, + "learning_rate": 0.0002, + "loss": 1.2514, + "step": 3020 + }, + { + "epoch": 5.058430717863105, + "grad_norm": 1.1939337253570557, + "learning_rate": 0.0002, + "loss": 1.224, + "step": 3030 + }, + { + "epoch": 5.075125208681135, + "grad_norm": 1.1471049785614014, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 3040 + }, + { + "epoch": 5.091819699499165, + "grad_norm": 1.0808285474777222, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 3050 + }, + { + "epoch": 5.108514190317195, + "grad_norm": 1.0102492570877075, + "learning_rate": 0.0002, + "loss": 1.1644, + "step": 3060 + }, + { + "epoch": 5.125208681135225, + "grad_norm": 0.9869397282600403, + "learning_rate": 0.0002, + "loss": 1.1652, + "step": 3070 + }, + { + "epoch": 5.141903171953255, + "grad_norm": 0.9689525365829468, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3080 + }, + { + "epoch": 5.158597662771285, + "grad_norm": 0.9293769598007202, + "learning_rate": 0.0002, + "loss": 1.1747, + "step": 3090 + }, + { + "epoch": 5.175292153589315, + "grad_norm": 0.9289103150367737, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 3100 + }, + { + "epoch": 5.191986644407345, + "grad_norm": 0.9736173152923584, + "learning_rate": 0.0002, + "loss": 1.2538, + "step": 3110 + }, + { + "epoch": 5.208681135225375, + "grad_norm": 1.3144289255142212, + "learning_rate": 0.0002, + "loss": 1.2429, + "step": 3120 + }, + { + "epoch": 5.225375626043405, + "grad_norm": 0.95982825756073, + "learning_rate": 0.0002, + "loss": 1.2107, + "step": 3130 + }, + { + "epoch": 5.242070116861436, + "grad_norm": 0.903189480304718, + "learning_rate": 0.0002, + "loss": 1.2239, + "step": 3140 + }, + { + "epoch": 5.258764607679466, + "grad_norm": 1.056692123413086, + "learning_rate": 0.0002, + "loss": 1.2663, + "step": 3150 + }, + { + "epoch": 5.275459098497496, + "grad_norm": 1.1169359683990479, + "learning_rate": 0.0002, + "loss": 1.2955, + "step": 3160 + }, + { + "epoch": 5.292153589315526, + "grad_norm": 1.2178374528884888, + "learning_rate": 0.0002, + "loss": 1.1559, + "step": 3170 + }, + { + "epoch": 5.308848080133556, + "grad_norm": 0.9956373572349548, + "learning_rate": 0.0002, + "loss": 1.2394, + "step": 3180 + }, + { + "epoch": 5.325542570951586, + "grad_norm": 0.959555447101593, + "learning_rate": 0.0002, + "loss": 1.1792, + "step": 3190 + }, + { + "epoch": 5.342237061769616, + "grad_norm": 0.9343846440315247, + "learning_rate": 0.0002, + "loss": 1.1817, + "step": 3200 + }, + { + "epoch": 5.358931552587646, + "grad_norm": 0.8806524872779846, + "learning_rate": 0.0002, + "loss": 1.2033, + "step": 3210 + }, + { + "epoch": 5.375626043405676, + "grad_norm": 0.9477803111076355, + "learning_rate": 0.0002, + "loss": 1.2511, + "step": 3220 + }, + { + "epoch": 5.392320534223706, + "grad_norm": 0.9975674152374268, + "learning_rate": 0.0002, + "loss": 1.2011, + "step": 3230 + }, + { + "epoch": 5.409015025041736, + "grad_norm": 0.9650071263313293, + "learning_rate": 0.0002, + "loss": 1.3012, + "step": 3240 + }, + { + "epoch": 5.425709515859766, + "grad_norm": 1.0170838832855225, + "learning_rate": 0.0002, + "loss": 1.2281, + "step": 3250 + }, + { + "epoch": 5.442404006677796, + "grad_norm": 1.158118486404419, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 3260 + }, + { + "epoch": 5.459098497495827, + "grad_norm": 1.0228497982025146, + "learning_rate": 0.0002, + "loss": 1.3333, + "step": 3270 + }, + { + "epoch": 5.475792988313857, + "grad_norm": 1.0101768970489502, + "learning_rate": 0.0002, + "loss": 1.1961, + "step": 3280 + }, + { + "epoch": 5.492487479131887, + "grad_norm": 1.0407295227050781, + "learning_rate": 0.0002, + "loss": 1.3058, + "step": 3290 + }, + { + "epoch": 5.509181969949917, + "grad_norm": 0.9337932467460632, + "learning_rate": 0.0002, + "loss": 1.2062, + "step": 3300 + }, + { + "epoch": 5.525876460767947, + "grad_norm": 1.0305527448654175, + "learning_rate": 0.0002, + "loss": 1.2241, + "step": 3310 + }, + { + "epoch": 5.542570951585977, + "grad_norm": 1.0523453950881958, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 3320 + }, + { + "epoch": 5.559265442404007, + "grad_norm": 0.9707391858100891, + "learning_rate": 0.0002, + "loss": 1.2526, + "step": 3330 + }, + { + "epoch": 5.575959933222037, + "grad_norm": 1.0054972171783447, + "learning_rate": 0.0002, + "loss": 1.3002, + "step": 3340 + }, + { + "epoch": 5.592654424040067, + "grad_norm": 1.0393340587615967, + "learning_rate": 0.0002, + "loss": 1.2459, + "step": 3350 + }, + { + "epoch": 5.609348914858097, + "grad_norm": 1.0671277046203613, + "learning_rate": 0.0002, + "loss": 1.2328, + "step": 3360 + }, + { + "epoch": 5.626043405676127, + "grad_norm": 1.0725873708724976, + "learning_rate": 0.0002, + "loss": 1.2415, + "step": 3370 + }, + { + "epoch": 5.642737896494157, + "grad_norm": 0.9844746589660645, + "learning_rate": 0.0002, + "loss": 1.2475, + "step": 3380 + }, + { + "epoch": 5.659432387312187, + "grad_norm": 0.9659736752510071, + "learning_rate": 0.0002, + "loss": 1.1997, + "step": 3390 + }, + { + "epoch": 5.676126878130217, + "grad_norm": 0.9152608513832092, + "learning_rate": 0.0002, + "loss": 1.2426, + "step": 3400 + }, + { + "epoch": 5.692821368948247, + "grad_norm": 0.9759509563446045, + "learning_rate": 0.0002, + "loss": 1.2424, + "step": 3410 + }, + { + "epoch": 5.709515859766277, + "grad_norm": 1.0662057399749756, + "learning_rate": 0.0002, + "loss": 1.2264, + "step": 3420 + }, + { + "epoch": 5.726210350584307, + "grad_norm": 0.9780185222625732, + "learning_rate": 0.0002, + "loss": 1.19, + "step": 3430 + }, + { + "epoch": 5.742904841402337, + "grad_norm": 0.9781617522239685, + "learning_rate": 0.0002, + "loss": 1.2603, + "step": 3440 + }, + { + "epoch": 5.759599332220367, + "grad_norm": 1.0790785551071167, + "learning_rate": 0.0002, + "loss": 1.2472, + "step": 3450 + }, + { + "epoch": 5.776293823038397, + "grad_norm": 1.0573410987854004, + "learning_rate": 0.0002, + "loss": 1.2697, + "step": 3460 + }, + { + "epoch": 5.792988313856427, + "grad_norm": 0.9953364729881287, + "learning_rate": 0.0002, + "loss": 1.2591, + "step": 3470 + }, + { + "epoch": 5.809682804674457, + "grad_norm": 1.0072667598724365, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 3480 + }, + { + "epoch": 5.826377295492487, + "grad_norm": 0.9312750697135925, + "learning_rate": 0.0002, + "loss": 1.286, + "step": 3490 + }, + { + "epoch": 5.843071786310517, + "grad_norm": 1.059614896774292, + "learning_rate": 0.0002, + "loss": 1.2379, + "step": 3500 + }, + { + "epoch": 5.859766277128547, + "grad_norm": 1.2089484930038452, + "learning_rate": 0.0002, + "loss": 1.2323, + "step": 3510 + }, + { + "epoch": 5.876460767946577, + "grad_norm": 1.0740607976913452, + "learning_rate": 0.0002, + "loss": 1.2047, + "step": 3520 + }, + { + "epoch": 5.893155258764608, + "grad_norm": 0.9620149731636047, + "learning_rate": 0.0002, + "loss": 1.2809, + "step": 3530 + }, + { + "epoch": 5.909849749582638, + "grad_norm": 1.0482431650161743, + "learning_rate": 0.0002, + "loss": 1.238, + "step": 3540 + }, + { + "epoch": 5.926544240400668, + "grad_norm": 0.9137503504753113, + "learning_rate": 0.0002, + "loss": 1.2621, + "step": 3550 + }, + { + "epoch": 5.943238731218698, + "grad_norm": 1.1599403619766235, + "learning_rate": 0.0002, + "loss": 1.3066, + "step": 3560 + }, + { + "epoch": 5.959933222036728, + "grad_norm": 0.911613404750824, + "learning_rate": 0.0002, + "loss": 1.2556, + "step": 3570 + }, + { + "epoch": 5.976627712854758, + "grad_norm": 0.9120033383369446, + "learning_rate": 0.0002, + "loss": 1.2746, + "step": 3580 + }, + { + "epoch": 5.993322203672788, + "grad_norm": 1.0588736534118652, + "learning_rate": 0.0002, + "loss": 1.2815, + "step": 3590 + }, + { + "epoch": 6.0, + "eval_loss": 2.0921614170074463, + "eval_runtime": 71.974, + "eval_samples_per_second": 7.155, + "eval_steps_per_second": 0.903, + "step": 3594 + }, + { + "epoch": 6.010016694490818, + "grad_norm": 0.9213348627090454, + "learning_rate": 0.0002, + "loss": 1.1397, + "step": 3600 + }, + { + "epoch": 6.026711185308848, + "grad_norm": 1.137640357017517, + "learning_rate": 0.0002, + "loss": 1.07, + "step": 3610 + }, + { + "epoch": 6.043405676126878, + "grad_norm": 1.200276494026184, + "learning_rate": 0.0002, + "loss": 0.9953, + "step": 3620 + }, + { + "epoch": 6.060100166944908, + "grad_norm": 1.335649013519287, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 3630 + }, + { + "epoch": 6.076794657762938, + "grad_norm": 1.1353906393051147, + "learning_rate": 0.0002, + "loss": 1.1154, + "step": 3640 + }, + { + "epoch": 6.093489148580968, + "grad_norm": 1.0406795740127563, + "learning_rate": 0.0002, + "loss": 1.0481, + "step": 3650 + }, + { + "epoch": 6.110183639398999, + "grad_norm": 1.2691017389297485, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 3660 + }, + { + "epoch": 6.126878130217029, + "grad_norm": 1.3334898948669434, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 3670 + }, + { + "epoch": 6.143572621035059, + "grad_norm": 1.1766020059585571, + "learning_rate": 0.0002, + "loss": 1.0186, + "step": 3680 + }, + { + "epoch": 6.160267111853089, + "grad_norm": 1.1079157590866089, + "learning_rate": 0.0002, + "loss": 1.0431, + "step": 3690 + }, + { + "epoch": 6.176961602671119, + "grad_norm": 1.4312299489974976, + "learning_rate": 0.0002, + "loss": 1.0395, + "step": 3700 + }, + { + "epoch": 6.193656093489149, + "grad_norm": 1.2636224031448364, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 3710 + }, + { + "epoch": 6.210350584307179, + "grad_norm": 1.1957253217697144, + "learning_rate": 0.0002, + "loss": 1.0669, + "step": 3720 + }, + { + "epoch": 6.227045075125209, + "grad_norm": 1.1044131517410278, + "learning_rate": 0.0002, + "loss": 1.0199, + "step": 3730 + }, + { + "epoch": 6.243739565943239, + "grad_norm": 1.2045193910598755, + "learning_rate": 0.0002, + "loss": 1.0316, + "step": 3740 + }, + { + "epoch": 6.260434056761269, + "grad_norm": 1.0740957260131836, + "learning_rate": 0.0002, + "loss": 1.1218, + "step": 3750 + }, + { + "epoch": 6.277128547579299, + "grad_norm": 1.1548833847045898, + "learning_rate": 0.0002, + "loss": 1.0271, + "step": 3760 + }, + { + "epoch": 6.293823038397329, + "grad_norm": 1.257440209388733, + "learning_rate": 0.0002, + "loss": 1.14, + "step": 3770 + }, + { + "epoch": 6.310517529215359, + "grad_norm": 1.1988940238952637, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 3780 + }, + { + "epoch": 6.3272120200333895, + "grad_norm": 1.1707229614257812, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 3790 + }, + { + "epoch": 6.343906510851419, + "grad_norm": 1.360107660293579, + "learning_rate": 0.0002, + "loss": 1.053, + "step": 3800 + }, + { + "epoch": 6.360601001669449, + "grad_norm": 1.249742031097412, + "learning_rate": 0.0002, + "loss": 1.0637, + "step": 3810 + }, + { + "epoch": 6.377295492487479, + "grad_norm": 1.2729560136795044, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 3820 + }, + { + "epoch": 6.393989983305509, + "grad_norm": 1.241761565208435, + "learning_rate": 0.0002, + "loss": 1.1217, + "step": 3830 + }, + { + "epoch": 6.410684474123539, + "grad_norm": 1.1892873048782349, + "learning_rate": 0.0002, + "loss": 1.0648, + "step": 3840 + }, + { + "epoch": 6.427378964941569, + "grad_norm": 1.1766357421875, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 3850 + }, + { + "epoch": 6.444073455759599, + "grad_norm": 1.2642168998718262, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 3860 + }, + { + "epoch": 6.460767946577629, + "grad_norm": 1.3390182256698608, + "learning_rate": 0.0002, + "loss": 1.0748, + "step": 3870 + }, + { + "epoch": 6.477462437395659, + "grad_norm": 1.183168649673462, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 3880 + }, + { + "epoch": 6.494156928213689, + "grad_norm": 1.1458892822265625, + "learning_rate": 0.0002, + "loss": 1.0696, + "step": 3890 + }, + { + "epoch": 6.510851419031719, + "grad_norm": 1.2736095190048218, + "learning_rate": 0.0002, + "loss": 1.1625, + "step": 3900 + }, + { + "epoch": 6.527545909849749, + "grad_norm": 1.323607087135315, + "learning_rate": 0.0002, + "loss": 1.1175, + "step": 3910 + }, + { + "epoch": 6.54424040066778, + "grad_norm": 1.2177817821502686, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 3920 + }, + { + "epoch": 6.560934891485809, + "grad_norm": 1.3270750045776367, + "learning_rate": 0.0002, + "loss": 1.0333, + "step": 3930 + }, + { + "epoch": 6.57762938230384, + "grad_norm": 1.0974372625350952, + "learning_rate": 0.0002, + "loss": 1.0589, + "step": 3940 + }, + { + "epoch": 6.59432387312187, + "grad_norm": 1.3352670669555664, + "learning_rate": 0.0002, + "loss": 1.1347, + "step": 3950 + }, + { + "epoch": 6.6110183639399, + "grad_norm": 1.3174126148223877, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 3960 + }, + { + "epoch": 6.62771285475793, + "grad_norm": 1.1783626079559326, + "learning_rate": 0.0002, + "loss": 1.1697, + "step": 3970 + }, + { + "epoch": 6.64440734557596, + "grad_norm": 1.1886446475982666, + "learning_rate": 0.0002, + "loss": 1.1256, + "step": 3980 + }, + { + "epoch": 6.66110183639399, + "grad_norm": 1.2215187549591064, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 3990 + }, + { + "epoch": 6.67779632721202, + "grad_norm": 1.0320725440979004, + "learning_rate": 0.0002, + "loss": 1.1236, + "step": 4000 + }, + { + "epoch": 6.69449081803005, + "grad_norm": 1.340338110923767, + "learning_rate": 0.0002, + "loss": 1.0828, + "step": 4010 + }, + { + "epoch": 6.71118530884808, + "grad_norm": 1.1496273279190063, + "learning_rate": 0.0002, + "loss": 1.0942, + "step": 4020 + }, + { + "epoch": 6.72787979966611, + "grad_norm": 1.5720409154891968, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 4030 + }, + { + "epoch": 6.74457429048414, + "grad_norm": 1.497376799583435, + "learning_rate": 0.0002, + "loss": 1.1385, + "step": 4040 + }, + { + "epoch": 6.76126878130217, + "grad_norm": 1.1594456434249878, + "learning_rate": 0.0002, + "loss": 1.0808, + "step": 4050 + }, + { + "epoch": 6.7779632721202, + "grad_norm": 1.326546549797058, + "learning_rate": 0.0002, + "loss": 1.1541, + "step": 4060 + }, + { + "epoch": 6.794657762938231, + "grad_norm": 1.18723726272583, + "learning_rate": 0.0002, + "loss": 1.1314, + "step": 4070 + }, + { + "epoch": 6.811352253756261, + "grad_norm": 1.2974154949188232, + "learning_rate": 0.0002, + "loss": 1.1906, + "step": 4080 + }, + { + "epoch": 6.828046744574291, + "grad_norm": 1.207748532295227, + "learning_rate": 0.0002, + "loss": 1.0534, + "step": 4090 + }, + { + "epoch": 6.844741235392321, + "grad_norm": 1.2398537397384644, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 4100 + }, + { + "epoch": 6.861435726210351, + "grad_norm": 1.1657508611679077, + "learning_rate": 0.0002, + "loss": 1.1348, + "step": 4110 + }, + { + "epoch": 6.878130217028381, + "grad_norm": 1.1986382007598877, + "learning_rate": 0.0002, + "loss": 1.1315, + "step": 4120 + }, + { + "epoch": 6.894824707846411, + "grad_norm": 1.407080054283142, + "learning_rate": 0.0002, + "loss": 1.0781, + "step": 4130 + }, + { + "epoch": 6.911519198664441, + "grad_norm": 1.0725297927856445, + "learning_rate": 0.0002, + "loss": 1.0515, + "step": 4140 + }, + { + "epoch": 6.928213689482471, + "grad_norm": 1.2659991979599, + "learning_rate": 0.0002, + "loss": 1.1602, + "step": 4150 + }, + { + "epoch": 6.944908180300501, + "grad_norm": 1.0579404830932617, + "learning_rate": 0.0002, + "loss": 1.1373, + "step": 4160 + }, + { + "epoch": 6.961602671118531, + "grad_norm": 1.254502296447754, + "learning_rate": 0.0002, + "loss": 1.1441, + "step": 4170 + }, + { + "epoch": 6.978297161936561, + "grad_norm": 1.2666021585464478, + "learning_rate": 0.0002, + "loss": 1.1019, + "step": 4180 + }, + { + "epoch": 6.994991652754591, + "grad_norm": 1.236793041229248, + "learning_rate": 0.0002, + "loss": 1.0675, + "step": 4190 + }, + { + "epoch": 7.0, + "eval_loss": 2.211871862411499, + "eval_runtime": 56.9215, + "eval_samples_per_second": 9.048, + "eval_steps_per_second": 1.142, + "step": 4193 + }, + { + "epoch": 7.011686143572621, + "grad_norm": 1.8114486932754517, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 4200 + }, + { + "epoch": 7.028380634390651, + "grad_norm": 2.062814235687256, + "learning_rate": 0.0002, + "loss": 0.8505, + "step": 4210 + }, + { + "epoch": 7.045075125208681, + "grad_norm": 1.4835841655731201, + "learning_rate": 0.0002, + "loss": 0.8606, + "step": 4220 + }, + { + "epoch": 7.061769616026711, + "grad_norm": 1.3040175437927246, + "learning_rate": 0.0002, + "loss": 0.862, + "step": 4230 + }, + { + "epoch": 7.078464106844741, + "grad_norm": 1.3654398918151855, + "learning_rate": 0.0002, + "loss": 0.9513, + "step": 4240 + }, + { + "epoch": 7.095158597662771, + "grad_norm": 1.3989132642745972, + "learning_rate": 0.0002, + "loss": 0.9272, + "step": 4250 + }, + { + "epoch": 7.111853088480801, + "grad_norm": 1.2168488502502441, + "learning_rate": 0.0002, + "loss": 0.9062, + "step": 4260 + }, + { + "epoch": 7.128547579298831, + "grad_norm": 1.52049720287323, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 4270 + }, + { + "epoch": 7.145242070116861, + "grad_norm": 1.4944370985031128, + "learning_rate": 0.0002, + "loss": 0.8486, + "step": 4280 + }, + { + "epoch": 7.161936560934891, + "grad_norm": 1.4657515287399292, + "learning_rate": 0.0002, + "loss": 0.8757, + "step": 4290 + }, + { + "epoch": 7.178631051752921, + "grad_norm": 1.373306155204773, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 4300 + }, + { + "epoch": 7.195325542570951, + "grad_norm": 1.3957229852676392, + "learning_rate": 0.0002, + "loss": 0.964, + "step": 4310 + }, + { + "epoch": 7.212020033388981, + "grad_norm": 1.3072983026504517, + "learning_rate": 0.0002, + "loss": 0.8777, + "step": 4320 + }, + { + "epoch": 7.228714524207012, + "grad_norm": 1.3311468362808228, + "learning_rate": 0.0002, + "loss": 0.8515, + "step": 4330 + }, + { + "epoch": 7.245409015025042, + "grad_norm": 1.3969240188598633, + "learning_rate": 0.0002, + "loss": 0.9641, + "step": 4340 + }, + { + "epoch": 7.262103505843072, + "grad_norm": 1.496384859085083, + "learning_rate": 0.0002, + "loss": 0.8767, + "step": 4350 + }, + { + "epoch": 7.278797996661102, + "grad_norm": 1.38449227809906, + "learning_rate": 0.0002, + "loss": 0.9378, + "step": 4360 + }, + { + "epoch": 7.295492487479132, + "grad_norm": 1.397478699684143, + "learning_rate": 0.0002, + "loss": 0.9299, + "step": 4370 + }, + { + "epoch": 7.312186978297162, + "grad_norm": 1.234455943107605, + "learning_rate": 0.0002, + "loss": 0.9067, + "step": 4380 + }, + { + "epoch": 7.328881469115192, + "grad_norm": 1.3813334703445435, + "learning_rate": 0.0002, + "loss": 0.9761, + "step": 4390 + }, + { + "epoch": 7.345575959933222, + "grad_norm": 1.3944685459136963, + "learning_rate": 0.0002, + "loss": 0.8766, + "step": 4400 + }, + { + "epoch": 7.362270450751252, + "grad_norm": 1.5999382734298706, + "learning_rate": 0.0002, + "loss": 0.9164, + "step": 4410 + }, + { + "epoch": 7.378964941569282, + "grad_norm": 1.753442406654358, + "learning_rate": 0.0002, + "loss": 0.9286, + "step": 4420 + }, + { + "epoch": 7.395659432387312, + "grad_norm": 1.4564250707626343, + "learning_rate": 0.0002, + "loss": 0.9248, + "step": 4430 + }, + { + "epoch": 7.412353923205342, + "grad_norm": 1.488957166671753, + "learning_rate": 0.0002, + "loss": 0.9011, + "step": 4440 + }, + { + "epoch": 7.429048414023372, + "grad_norm": 1.5810562372207642, + "learning_rate": 0.0002, + "loss": 0.9268, + "step": 4450 + }, + { + "epoch": 7.445742904841403, + "grad_norm": 1.2961808443069458, + "learning_rate": 0.0002, + "loss": 0.9033, + "step": 4460 + }, + { + "epoch": 7.462437395659433, + "grad_norm": 1.4854587316513062, + "learning_rate": 0.0002, + "loss": 0.951, + "step": 4470 + }, + { + "epoch": 7.479131886477463, + "grad_norm": 1.5555771589279175, + "learning_rate": 0.0002, + "loss": 0.9627, + "step": 4480 + }, + { + "epoch": 7.495826377295493, + "grad_norm": 1.5276654958724976, + "learning_rate": 0.0002, + "loss": 0.952, + "step": 4490 + }, + { + "epoch": 7.512520868113523, + "grad_norm": 1.4847941398620605, + "learning_rate": 0.0002, + "loss": 0.9679, + "step": 4500 + }, + { + "epoch": 7.529215358931553, + "grad_norm": 1.4122779369354248, + "learning_rate": 0.0002, + "loss": 0.9613, + "step": 4510 + }, + { + "epoch": 7.545909849749583, + "grad_norm": 1.497211217880249, + "learning_rate": 0.0002, + "loss": 0.9882, + "step": 4520 + }, + { + "epoch": 7.562604340567613, + "grad_norm": 1.4892537593841553, + "learning_rate": 0.0002, + "loss": 0.9778, + "step": 4530 + }, + { + "epoch": 7.579298831385643, + "grad_norm": 1.2664510011672974, + "learning_rate": 0.0002, + "loss": 0.9743, + "step": 4540 + }, + { + "epoch": 7.595993322203673, + "grad_norm": 1.4286391735076904, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 4550 + }, + { + "epoch": 7.612687813021703, + "grad_norm": 1.4727665185928345, + "learning_rate": 0.0002, + "loss": 0.995, + "step": 4560 + }, + { + "epoch": 7.629382303839733, + "grad_norm": 1.4128608703613281, + "learning_rate": 0.0002, + "loss": 0.8821, + "step": 4570 + }, + { + "epoch": 7.646076794657763, + "grad_norm": 1.4077776670455933, + "learning_rate": 0.0002, + "loss": 0.9227, + "step": 4580 + }, + { + "epoch": 7.6627712854757934, + "grad_norm": 1.760135293006897, + "learning_rate": 0.0002, + "loss": 0.9552, + "step": 4590 + }, + { + "epoch": 7.6794657762938225, + "grad_norm": 1.450317144393921, + "learning_rate": 0.0002, + "loss": 0.9896, + "step": 4600 + }, + { + "epoch": 7.696160267111853, + "grad_norm": 1.445032000541687, + "learning_rate": 0.0002, + "loss": 0.9701, + "step": 4610 + }, + { + "epoch": 7.712854757929883, + "grad_norm": 1.3218955993652344, + "learning_rate": 0.0002, + "loss": 0.975, + "step": 4620 + }, + { + "epoch": 7.729549248747913, + "grad_norm": 1.3336185216903687, + "learning_rate": 0.0002, + "loss": 0.9947, + "step": 4630 + }, + { + "epoch": 7.746243739565943, + "grad_norm": 1.3436596393585205, + "learning_rate": 0.0002, + "loss": 0.9918, + "step": 4640 + }, + { + "epoch": 7.762938230383973, + "grad_norm": 1.4396946430206299, + "learning_rate": 0.0002, + "loss": 1.0155, + "step": 4650 + }, + { + "epoch": 7.779632721202003, + "grad_norm": 1.5268234014511108, + "learning_rate": 0.0002, + "loss": 0.9928, + "step": 4660 + }, + { + "epoch": 7.796327212020033, + "grad_norm": 1.3981901407241821, + "learning_rate": 0.0002, + "loss": 0.9871, + "step": 4670 + }, + { + "epoch": 7.813021702838063, + "grad_norm": 1.6962796449661255, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 4680 + }, + { + "epoch": 7.829716193656093, + "grad_norm": 1.4803595542907715, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 4690 + }, + { + "epoch": 7.846410684474123, + "grad_norm": 1.4438055753707886, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 4700 + }, + { + "epoch": 7.863105175292153, + "grad_norm": 1.2435152530670166, + "learning_rate": 0.0002, + "loss": 0.961, + "step": 4710 + }, + { + "epoch": 7.879799666110184, + "grad_norm": 1.7456961870193481, + "learning_rate": 0.0002, + "loss": 1.0079, + "step": 4720 + }, + { + "epoch": 7.896494156928213, + "grad_norm": 1.1902697086334229, + "learning_rate": 0.0002, + "loss": 0.9359, + "step": 4730 + }, + { + "epoch": 7.913188647746244, + "grad_norm": 1.5772660970687866, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 4740 + }, + { + "epoch": 7.929883138564274, + "grad_norm": 1.593420386314392, + "learning_rate": 0.0002, + "loss": 1.0633, + "step": 4750 + }, + { + "epoch": 7.946577629382304, + "grad_norm": 1.3951916694641113, + "learning_rate": 0.0002, + "loss": 1.0801, + "step": 4760 + }, + { + "epoch": 7.963272120200334, + "grad_norm": 1.2561997175216675, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 4770 + }, + { + "epoch": 7.979966611018364, + "grad_norm": 1.3175349235534668, + "learning_rate": 0.0002, + "loss": 0.9279, + "step": 4780 + }, + { + "epoch": 7.996661101836394, + "grad_norm": 1.4960309267044067, + "learning_rate": 0.0002, + "loss": 1.0438, + "step": 4790 + }, + { + "epoch": 8.0, + "eval_loss": 2.3482723236083984, + "eval_runtime": 59.782, + "eval_samples_per_second": 8.615, + "eval_steps_per_second": 1.087, + "step": 4792 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.217630302774231e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-4792/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9069b3f0faf7a0d6ca062408f91678e178838950 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f842acd9f545027c4af9b5a92503999a507d8b437e37fff055d116c3ea5b429 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da32e5bf9c3d1e8a4faacca0aeb62396f1321646 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8722b153381d134e293faf550eab28c129904a1ab103f8879094adba722a6f05 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4b2259a39d874904f54d1e6702f3db0d5388c9b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439e2ecd35961f0de4f3547138e47757bff0c886401e2b6e6481a481b21ec2b0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c67dab4254a58bede6768d7bcf448423c750c39 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6ef4f5d47c7ee029c9a941f2297d6fa05a0f0cb767f35373c687526d8ff4545 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e10202e705a5c3ca40025784df0cd843c6a67721 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/trainer_state.json @@ -0,0 +1,454 @@ +{ + "best_metric": 1.8236571550369263, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 599, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01669449081803005, + "grad_norm": 0.4811326861381531, + "learning_rate": 0.0002, + "loss": 2.6298, + "step": 10 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 0.697903573513031, + "learning_rate": 0.0002, + "loss": 2.2673, + "step": 20 + }, + { + "epoch": 0.05008347245409015, + "grad_norm": 0.5622886419296265, + "learning_rate": 0.0002, + "loss": 2.0746, + "step": 30 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 0.4684421122074127, + "learning_rate": 0.0002, + "loss": 1.9808, + "step": 40 + }, + { + "epoch": 0.08347245409015025, + "grad_norm": 0.4790354371070862, + "learning_rate": 0.0002, + "loss": 1.9796, + "step": 50 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 0.5846750140190125, + "learning_rate": 0.0002, + "loss": 1.9269, + "step": 60 + }, + { + "epoch": 0.11686143572621036, + "grad_norm": 0.4034216105937958, + "learning_rate": 0.0002, + "loss": 1.9773, + "step": 70 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 0.4602500796318054, + "learning_rate": 0.0002, + "loss": 1.8688, + "step": 80 + }, + { + "epoch": 0.15025041736227046, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.0002, + "loss": 1.9703, + "step": 90 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 0.3892269730567932, + "learning_rate": 0.0002, + "loss": 1.8443, + "step": 100 + }, + { + "epoch": 0.18363939899833054, + "grad_norm": 0.40771016478538513, + "learning_rate": 0.0002, + "loss": 1.9192, + "step": 110 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 0.3820408880710602, + "learning_rate": 0.0002, + "loss": 1.8292, + "step": 120 + }, + { + "epoch": 0.21702838063439064, + "grad_norm": 0.3719843626022339, + "learning_rate": 0.0002, + "loss": 1.8313, + "step": 130 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 0.4359976351261139, + "learning_rate": 0.0002, + "loss": 1.8189, + "step": 140 + }, + { + "epoch": 0.25041736227045075, + "grad_norm": 0.3932259976863861, + "learning_rate": 0.0002, + "loss": 1.8125, + "step": 150 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 0.7001785635948181, + "learning_rate": 0.0002, + "loss": 1.8681, + "step": 160 + }, + { + "epoch": 0.2838063439065108, + "grad_norm": 0.7619664669036865, + "learning_rate": 0.0002, + "loss": 1.9328, + "step": 170 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 0.3715350329875946, + "learning_rate": 0.0002, + "loss": 1.7572, + "step": 180 + }, + { + "epoch": 0.31719532554257096, + "grad_norm": 0.5008004903793335, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 190 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 0.47509506344795227, + "learning_rate": 0.0002, + "loss": 1.8469, + "step": 200 + }, + { + "epoch": 0.35058430717863104, + "grad_norm": 0.41775935888290405, + "learning_rate": 0.0002, + "loss": 1.9042, + "step": 210 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 0.43939948081970215, + "learning_rate": 0.0002, + "loss": 1.7995, + "step": 220 + }, + { + "epoch": 0.38397328881469117, + "grad_norm": 0.5101977586746216, + "learning_rate": 0.0002, + "loss": 1.8535, + "step": 230 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 0.37367475032806396, + "learning_rate": 0.0002, + "loss": 1.8261, + "step": 240 + }, + { + "epoch": 0.41736227045075125, + "grad_norm": 0.38865089416503906, + "learning_rate": 0.0002, + "loss": 1.8361, + "step": 250 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 0.33937838673591614, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 260 + }, + { + "epoch": 0.4507512520868113, + "grad_norm": 0.41416028141975403, + "learning_rate": 0.0002, + "loss": 1.8288, + "step": 270 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 0.4010271430015564, + "learning_rate": 0.0002, + "loss": 1.8542, + "step": 280 + }, + { + "epoch": 0.48414023372287146, + "grad_norm": 0.3960907459259033, + "learning_rate": 0.0002, + "loss": 1.8803, + "step": 290 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 0.357433021068573, + "learning_rate": 0.0002, + "loss": 1.8875, + "step": 300 + }, + { + "epoch": 0.5175292153589316, + "grad_norm": 0.38190674781799316, + "learning_rate": 0.0002, + "loss": 1.8478, + "step": 310 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 0.3336802124977112, + "learning_rate": 0.0002, + "loss": 1.8142, + "step": 320 + }, + { + "epoch": 0.5509181969949917, + "grad_norm": 0.35935860872268677, + "learning_rate": 0.0002, + "loss": 1.82, + "step": 330 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 0.3950583040714264, + "learning_rate": 0.0002, + "loss": 1.854, + "step": 340 + }, + { + "epoch": 0.5843071786310517, + "grad_norm": 0.31413400173187256, + "learning_rate": 0.0002, + "loss": 1.8089, + "step": 350 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 0.3342890441417694, + "learning_rate": 0.0002, + "loss": 1.8417, + "step": 360 + }, + { + "epoch": 0.6176961602671118, + "grad_norm": 0.36961331963539124, + "learning_rate": 0.0002, + "loss": 1.766, + "step": 370 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 0.350652813911438, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 380 + }, + { + "epoch": 0.6510851419031719, + "grad_norm": 0.3588177263736725, + "learning_rate": 0.0002, + "loss": 1.7797, + "step": 390 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 0.3327147960662842, + "learning_rate": 0.0002, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 0.6844741235392321, + "grad_norm": 0.3632844388484955, + "learning_rate": 0.0002, + "loss": 1.754, + "step": 410 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 0.34581053256988525, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 420 + }, + { + "epoch": 0.7178631051752922, + "grad_norm": 0.37237727642059326, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 430 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 0.48366475105285645, + "learning_rate": 0.0002, + "loss": 1.711, + "step": 440 + }, + { + "epoch": 0.7512520868113522, + "grad_norm": 0.3512793183326721, + "learning_rate": 0.0002, + "loss": 1.8658, + "step": 450 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 0.30473145842552185, + "learning_rate": 0.0002, + "loss": 1.7687, + "step": 460 + }, + { + "epoch": 0.7846410684474123, + "grad_norm": 0.3718157112598419, + "learning_rate": 0.0002, + "loss": 1.789, + "step": 470 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 0.34506872296333313, + "learning_rate": 0.0002, + "loss": 1.8527, + "step": 480 + }, + { + "epoch": 0.8180300500834724, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0002, + "loss": 1.8163, + "step": 490 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 0.35659778118133545, + "learning_rate": 0.0002, + "loss": 1.8499, + "step": 500 + }, + { + "epoch": 0.8514190317195326, + "grad_norm": 0.3631179928779602, + "learning_rate": 0.0002, + "loss": 1.8096, + "step": 510 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 0.3252873420715332, + "learning_rate": 0.0002, + "loss": 1.8003, + "step": 520 + }, + { + "epoch": 0.8848080133555927, + "grad_norm": 0.32796111702919006, + "learning_rate": 0.0002, + "loss": 1.7831, + "step": 530 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 0.3556145131587982, + "learning_rate": 0.0002, + "loss": 1.7675, + "step": 540 + }, + { + "epoch": 0.9181969949916527, + "grad_norm": 0.33029764890670776, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 0.3531745970249176, + "learning_rate": 0.0002, + "loss": 1.7932, + "step": 560 + }, + { + "epoch": 0.9515859766277128, + "grad_norm": 0.3486989140510559, + "learning_rate": 0.0002, + "loss": 1.7982, + "step": 570 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 0.34676939249038696, + "learning_rate": 0.0002, + "loss": 1.8522, + "step": 580 + }, + { + "epoch": 0.9849749582637729, + "grad_norm": 0.3389652669429779, + "learning_rate": 0.0002, + "loss": 1.7805, + "step": 590 + }, + { + "epoch": 1.0, + "eval_loss": 1.8236571550369263, + "eval_runtime": 77.157, + "eval_samples_per_second": 6.675, + "eval_steps_per_second": 0.842, + "step": 599 + } + ], + "logging_steps": 10, + "max_steps": 4792, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.772037878467789e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..64ca22b17ce8b460e2033540a408d9926ab1e69d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:790482b172d64d1d94b484408c50783310f7b01be29f8cf82f809f2acd1ec121 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0321287d4ba9c2ca6778d76460ebc9c71573b965 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 599, "epoch_duration": 1328.1681866645813, "total_accumulated_duration": 1328.1681866645813, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}]} +{"epoch": 2.0, "step": 1198, "epoch_duration": 1468.0721940994263, "total_accumulated_duration": 2796.2403807640076, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-599", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}]} +{"epoch": 3.0, "step": 1797, "epoch_duration": 1533.548261642456, "total_accumulated_duration": 4329.788642406464, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}]} +{"epoch": 4.0, "step": 2396, "epoch_duration": 1466.7295718193054, "total_accumulated_duration": 5796.518214225769, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}, {"eval_loss": 1.8456445932388306, "eval_runtime": 87.6261, "eval_samples_per_second": 5.877, "eval_steps_per_second": 0.742, "epoch": 3.0, "step": 1797}, {"loss": 1.6224, "grad_norm": 0.40323764085769653, "learning_rate": 0.0002, "epoch": 3.005008347245409, "step": 1800}, {"loss": 1.5367, "grad_norm": 0.45069044828414917, "learning_rate": 0.0002, "epoch": 3.021702838063439, "step": 1810}, {"loss": 1.5271, "grad_norm": 0.6204925775527954, "learning_rate": 0.0002, "epoch": 3.038397328881469, "step": 1820}, {"loss": 1.5056, "grad_norm": 0.5857783555984497, "learning_rate": 0.0002, "epoch": 3.0550918196994994, "step": 1830}, {"loss": 1.5137, "grad_norm": 0.6776524782180786, "learning_rate": 0.0002, "epoch": 3.0717863105175294, "step": 1840}, {"loss": 1.5106, "grad_norm": 0.5486199855804443, "learning_rate": 0.0002, "epoch": 3.0884808013355594, "step": 1850}, {"loss": 1.414, "grad_norm": 0.5496503710746765, "learning_rate": 0.0002, "epoch": 3.1051752921535893, "step": 1860}, {"loss": 1.5181, "grad_norm": 0.5602648258209229, "learning_rate": 0.0002, "epoch": 3.1218697829716193, "step": 1870}, {"loss": 1.5406, "grad_norm": 1.0697380304336548, "learning_rate": 0.0002, "epoch": 3.1385642737896493, "step": 1880}, {"loss": 1.4889, "grad_norm": 0.6087332367897034, "learning_rate": 0.0002, "epoch": 3.1552587646076793, "step": 1890}, {"loss": 1.5219, "grad_norm": 0.5112161040306091, "learning_rate": 0.0002, "epoch": 3.1719532554257097, "step": 1900}, {"loss": 1.5139, "grad_norm": 0.6393680572509766, "learning_rate": 0.0002, "epoch": 3.1886477462437397, "step": 1910}, {"loss": 1.5337, "grad_norm": 0.7201815247535706, "learning_rate": 0.0002, "epoch": 3.2053422370617697, "step": 1920}, {"loss": 1.6055, "grad_norm": 0.5856018662452698, "learning_rate": 0.0002, "epoch": 3.2220367278797997, "step": 1930}, {"loss": 1.4791, "grad_norm": 0.581247866153717, "learning_rate": 0.0002, "epoch": 3.2387312186978297, "step": 1940}, {"loss": 1.5395, "grad_norm": 0.6055102944374084, "learning_rate": 0.0002, "epoch": 3.2554257095158596, "step": 1950}, {"loss": 1.5086, "grad_norm": 0.546894371509552, "learning_rate": 0.0002, "epoch": 3.27212020033389, "step": 1960}, {"loss": 1.5712, "grad_norm": 0.565558910369873, "learning_rate": 0.0002, "epoch": 3.28881469115192, "step": 1970}, {"loss": 1.47, "grad_norm": 1.2238883972167969, "learning_rate": 0.0002, "epoch": 3.30550918196995, "step": 1980}, {"loss": 1.4655, "grad_norm": 0.6362585425376892, "learning_rate": 0.0002, "epoch": 3.32220367278798, "step": 1990}, {"loss": 1.5157, "grad_norm": 0.6131124496459961, "learning_rate": 0.0002, "epoch": 3.33889816360601, "step": 2000}, {"loss": 1.5322, "grad_norm": 0.5181341767311096, "learning_rate": 0.0002, "epoch": 3.35559265442404, "step": 2010}, {"loss": 1.5039, "grad_norm": 0.6667609810829163, "learning_rate": 0.0002, "epoch": 3.37228714524207, "step": 2020}, {"loss": 1.5814, "grad_norm": 0.6488749980926514, "learning_rate": 0.0002, "epoch": 3.3889816360601, "step": 2030}, {"loss": 1.5226, "grad_norm": 0.5693286061286926, "learning_rate": 0.0002, "epoch": 3.4056761268781304, "step": 2040}, {"loss": 1.5121, "grad_norm": 0.6154143810272217, "learning_rate": 0.0002, "epoch": 3.4223706176961604, "step": 2050}, {"loss": 1.6033, "grad_norm": 0.6747981309890747, "learning_rate": 0.0002, "epoch": 3.4390651085141903, "step": 2060}, {"loss": 1.5857, "grad_norm": 0.5494789481163025, "learning_rate": 0.0002, "epoch": 3.4557595993322203, "step": 2070}, {"loss": 1.5223, "grad_norm": 2.481968402862549, "learning_rate": 0.0002, "epoch": 3.4724540901502503, "step": 2080}, {"loss": 1.4989, "grad_norm": 0.589784562587738, "learning_rate": 0.0002, "epoch": 3.4891485809682803, "step": 2090}, {"loss": 1.6227, "grad_norm": 0.6449820399284363, "learning_rate": 0.0002, "epoch": 3.5058430717863107, "step": 2100}, {"loss": 1.588, "grad_norm": 0.6467038989067078, "learning_rate": 0.0002, "epoch": 3.5225375626043407, "step": 2110}, {"loss": 1.5655, "grad_norm": 0.6533533334732056, "learning_rate": 0.0002, "epoch": 3.5392320534223707, "step": 2120}, {"loss": 1.6052, "grad_norm": 0.6804035902023315, "learning_rate": 0.0002, "epoch": 3.5559265442404007, "step": 2130}, {"loss": 1.5408, "grad_norm": 0.628773033618927, "learning_rate": 0.0002, "epoch": 3.5726210350584306, "step": 2140}, {"loss": 1.5487, "grad_norm": 0.6055739521980286, "learning_rate": 0.0002, "epoch": 3.5893155258764606, "step": 2150}, {"loss": 1.5305, "grad_norm": 0.6000894904136658, "learning_rate": 0.0002, "epoch": 3.6060100166944906, "step": 2160}, {"loss": 1.4742, "grad_norm": 0.5862473249435425, "learning_rate": 0.0002, "epoch": 3.6227045075125206, "step": 2170}, {"loss": 1.503, "grad_norm": 0.6547419428825378, "learning_rate": 0.0002, "epoch": 3.639398998330551, "step": 2180}, {"loss": 1.4704, "grad_norm": 0.5610318779945374, "learning_rate": 0.0002, "epoch": 3.656093489148581, "step": 2190}, {"loss": 1.4814, "grad_norm": 0.6387564539909363, "learning_rate": 0.0002, "epoch": 3.672787979966611, "step": 2200}, {"loss": 1.5356, "grad_norm": 0.6065090894699097, "learning_rate": 0.0002, "epoch": 3.689482470784641, "step": 2210}, {"loss": 1.5074, "grad_norm": 0.6266646981239319, "learning_rate": 0.0002, "epoch": 3.706176961602671, "step": 2220}, {"loss": 1.5146, "grad_norm": 0.626944363117218, "learning_rate": 0.0002, "epoch": 3.7228714524207014, "step": 2230}, {"loss": 1.5131, "grad_norm": 0.6043975949287415, "learning_rate": 0.0002, "epoch": 3.7395659432387314, "step": 2240}, {"loss": 1.5929, "grad_norm": 0.599732518196106, "learning_rate": 0.0002, "epoch": 3.7562604340567614, "step": 2250}, {"loss": 1.5236, "grad_norm": 0.6738389134407043, "learning_rate": 0.0002, "epoch": 3.7729549248747913, "step": 2260}, {"loss": 1.5003, "grad_norm": 0.5561335682868958, "learning_rate": 0.0002, "epoch": 3.7896494156928213, "step": 2270}, {"loss": 1.5013, "grad_norm": 0.6185726523399353, "learning_rate": 0.0002, "epoch": 3.8063439065108513, "step": 2280}, {"loss": 1.4996, "grad_norm": 0.6151532530784607, "learning_rate": 0.0002, "epoch": 3.8230383973288813, "step": 2290}, {"loss": 1.5453, "grad_norm": 0.5808233022689819, "learning_rate": 0.0002, "epoch": 3.8397328881469113, "step": 2300}, {"loss": 1.5223, "grad_norm": 0.6615163683891296, "learning_rate": 0.0002, "epoch": 3.8564273789649417, "step": 2310}, {"loss": 1.4365, "grad_norm": 0.5832979679107666, "learning_rate": 0.0002, "epoch": 3.8731218697829717, "step": 2320}, {"loss": 1.6036, "grad_norm": 0.6119300127029419, "learning_rate": 0.0002, "epoch": 3.8898163606010017, "step": 2330}, {"loss": 1.5581, "grad_norm": 0.6489697694778442, "learning_rate": 0.0002, "epoch": 3.9065108514190316, "step": 2340}, {"loss": 1.5601, "grad_norm": 0.5539063215255737, "learning_rate": 0.0002, "epoch": 3.9232053422370616, "step": 2350}, {"loss": 1.5174, "grad_norm": 0.6062877178192139, "learning_rate": 0.0002, "epoch": 3.939899833055092, "step": 2360}, {"loss": 1.5168, "grad_norm": 0.680609941482544, "learning_rate": 0.0002, "epoch": 3.956594323873122, "step": 2370}, {"loss": 1.4875, "grad_norm": 0.6176834106445312, "learning_rate": 0.0002, "epoch": 3.973288814691152, "step": 2380}, {"loss": 1.4984, "grad_norm": 0.6538102030754089, "learning_rate": 0.0002, "epoch": 3.989983305509182, "step": 2390}]} +{"epoch": 5.0, "step": 2995, "epoch_duration": 1497.549619436264, "total_accumulated_duration": 7294.067833662033, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}, {"eval_loss": 1.8456445932388306, "eval_runtime": 87.6261, "eval_samples_per_second": 5.877, "eval_steps_per_second": 0.742, "epoch": 3.0, "step": 1797}, {"loss": 1.6224, "grad_norm": 0.40323764085769653, "learning_rate": 0.0002, "epoch": 3.005008347245409, "step": 1800}, {"loss": 1.5367, "grad_norm": 0.45069044828414917, "learning_rate": 0.0002, "epoch": 3.021702838063439, "step": 1810}, {"loss": 1.5271, "grad_norm": 0.6204925775527954, "learning_rate": 0.0002, "epoch": 3.038397328881469, "step": 1820}, {"loss": 1.5056, "grad_norm": 0.5857783555984497, "learning_rate": 0.0002, "epoch": 3.0550918196994994, "step": 1830}, {"loss": 1.5137, "grad_norm": 0.6776524782180786, "learning_rate": 0.0002, "epoch": 3.0717863105175294, "step": 1840}, {"loss": 1.5106, "grad_norm": 0.5486199855804443, "learning_rate": 0.0002, "epoch": 3.0884808013355594, "step": 1850}, {"loss": 1.414, "grad_norm": 0.5496503710746765, "learning_rate": 0.0002, "epoch": 3.1051752921535893, "step": 1860}, {"loss": 1.5181, "grad_norm": 0.5602648258209229, "learning_rate": 0.0002, "epoch": 3.1218697829716193, "step": 1870}, {"loss": 1.5406, "grad_norm": 1.0697380304336548, "learning_rate": 0.0002, "epoch": 3.1385642737896493, "step": 1880}, {"loss": 1.4889, "grad_norm": 0.6087332367897034, "learning_rate": 0.0002, "epoch": 3.1552587646076793, "step": 1890}, {"loss": 1.5219, "grad_norm": 0.5112161040306091, "learning_rate": 0.0002, "epoch": 3.1719532554257097, "step": 1900}, {"loss": 1.5139, "grad_norm": 0.6393680572509766, "learning_rate": 0.0002, "epoch": 3.1886477462437397, "step": 1910}, {"loss": 1.5337, "grad_norm": 0.7201815247535706, "learning_rate": 0.0002, "epoch": 3.2053422370617697, "step": 1920}, {"loss": 1.6055, "grad_norm": 0.5856018662452698, "learning_rate": 0.0002, "epoch": 3.2220367278797997, "step": 1930}, {"loss": 1.4791, "grad_norm": 0.581247866153717, "learning_rate": 0.0002, "epoch": 3.2387312186978297, "step": 1940}, {"loss": 1.5395, "grad_norm": 0.6055102944374084, "learning_rate": 0.0002, "epoch": 3.2554257095158596, "step": 1950}, {"loss": 1.5086, "grad_norm": 0.546894371509552, "learning_rate": 0.0002, "epoch": 3.27212020033389, "step": 1960}, {"loss": 1.5712, "grad_norm": 0.565558910369873, "learning_rate": 0.0002, "epoch": 3.28881469115192, "step": 1970}, {"loss": 1.47, "grad_norm": 1.2238883972167969, "learning_rate": 0.0002, "epoch": 3.30550918196995, "step": 1980}, {"loss": 1.4655, "grad_norm": 0.6362585425376892, "learning_rate": 0.0002, "epoch": 3.32220367278798, "step": 1990}, {"loss": 1.5157, "grad_norm": 0.6131124496459961, "learning_rate": 0.0002, "epoch": 3.33889816360601, "step": 2000}, {"loss": 1.5322, "grad_norm": 0.5181341767311096, "learning_rate": 0.0002, "epoch": 3.35559265442404, "step": 2010}, {"loss": 1.5039, "grad_norm": 0.6667609810829163, "learning_rate": 0.0002, "epoch": 3.37228714524207, "step": 2020}, {"loss": 1.5814, "grad_norm": 0.6488749980926514, "learning_rate": 0.0002, "epoch": 3.3889816360601, "step": 2030}, {"loss": 1.5226, "grad_norm": 0.5693286061286926, "learning_rate": 0.0002, "epoch": 3.4056761268781304, "step": 2040}, {"loss": 1.5121, "grad_norm": 0.6154143810272217, "learning_rate": 0.0002, "epoch": 3.4223706176961604, "step": 2050}, {"loss": 1.6033, "grad_norm": 0.6747981309890747, "learning_rate": 0.0002, "epoch": 3.4390651085141903, "step": 2060}, {"loss": 1.5857, "grad_norm": 0.5494789481163025, "learning_rate": 0.0002, "epoch": 3.4557595993322203, "step": 2070}, {"loss": 1.5223, "grad_norm": 2.481968402862549, "learning_rate": 0.0002, "epoch": 3.4724540901502503, "step": 2080}, {"loss": 1.4989, "grad_norm": 0.589784562587738, "learning_rate": 0.0002, "epoch": 3.4891485809682803, "step": 2090}, {"loss": 1.6227, "grad_norm": 0.6449820399284363, "learning_rate": 0.0002, "epoch": 3.5058430717863107, "step": 2100}, {"loss": 1.588, "grad_norm": 0.6467038989067078, "learning_rate": 0.0002, "epoch": 3.5225375626043407, "step": 2110}, {"loss": 1.5655, "grad_norm": 0.6533533334732056, "learning_rate": 0.0002, "epoch": 3.5392320534223707, "step": 2120}, {"loss": 1.6052, "grad_norm": 0.6804035902023315, "learning_rate": 0.0002, "epoch": 3.5559265442404007, "step": 2130}, {"loss": 1.5408, "grad_norm": 0.628773033618927, "learning_rate": 0.0002, "epoch": 3.5726210350584306, "step": 2140}, {"loss": 1.5487, "grad_norm": 0.6055739521980286, "learning_rate": 0.0002, "epoch": 3.5893155258764606, "step": 2150}, {"loss": 1.5305, "grad_norm": 0.6000894904136658, "learning_rate": 0.0002, "epoch": 3.6060100166944906, "step": 2160}, {"loss": 1.4742, "grad_norm": 0.5862473249435425, "learning_rate": 0.0002, "epoch": 3.6227045075125206, "step": 2170}, {"loss": 1.503, "grad_norm": 0.6547419428825378, "learning_rate": 0.0002, "epoch": 3.639398998330551, "step": 2180}, {"loss": 1.4704, "grad_norm": 0.5610318779945374, "learning_rate": 0.0002, "epoch": 3.656093489148581, "step": 2190}, {"loss": 1.4814, "grad_norm": 0.6387564539909363, "learning_rate": 0.0002, "epoch": 3.672787979966611, "step": 2200}, {"loss": 1.5356, "grad_norm": 0.6065090894699097, "learning_rate": 0.0002, "epoch": 3.689482470784641, "step": 2210}, {"loss": 1.5074, "grad_norm": 0.6266646981239319, "learning_rate": 0.0002, "epoch": 3.706176961602671, "step": 2220}, {"loss": 1.5146, "grad_norm": 0.626944363117218, "learning_rate": 0.0002, "epoch": 3.7228714524207014, "step": 2230}, {"loss": 1.5131, "grad_norm": 0.6043975949287415, "learning_rate": 0.0002, "epoch": 3.7395659432387314, "step": 2240}, {"loss": 1.5929, "grad_norm": 0.599732518196106, "learning_rate": 0.0002, "epoch": 3.7562604340567614, "step": 2250}, {"loss": 1.5236, "grad_norm": 0.6738389134407043, "learning_rate": 0.0002, "epoch": 3.7729549248747913, "step": 2260}, {"loss": 1.5003, "grad_norm": 0.5561335682868958, "learning_rate": 0.0002, "epoch": 3.7896494156928213, "step": 2270}, {"loss": 1.5013, "grad_norm": 0.6185726523399353, "learning_rate": 0.0002, "epoch": 3.8063439065108513, "step": 2280}, {"loss": 1.4996, "grad_norm": 0.6151532530784607, "learning_rate": 0.0002, "epoch": 3.8230383973288813, "step": 2290}, {"loss": 1.5453, "grad_norm": 0.5808233022689819, "learning_rate": 0.0002, "epoch": 3.8397328881469113, "step": 2300}, {"loss": 1.5223, "grad_norm": 0.6615163683891296, "learning_rate": 0.0002, "epoch": 3.8564273789649417, "step": 2310}, {"loss": 1.4365, "grad_norm": 0.5832979679107666, "learning_rate": 0.0002, "epoch": 3.8731218697829717, "step": 2320}, {"loss": 1.6036, "grad_norm": 0.6119300127029419, "learning_rate": 0.0002, "epoch": 3.8898163606010017, "step": 2330}, {"loss": 1.5581, "grad_norm": 0.6489697694778442, "learning_rate": 0.0002, "epoch": 3.9065108514190316, "step": 2340}, {"loss": 1.5601, "grad_norm": 0.5539063215255737, "learning_rate": 0.0002, "epoch": 3.9232053422370616, "step": 2350}, {"loss": 1.5174, "grad_norm": 0.6062877178192139, "learning_rate": 0.0002, "epoch": 3.939899833055092, "step": 2360}, {"loss": 1.5168, "grad_norm": 0.680609941482544, "learning_rate": 0.0002, "epoch": 3.956594323873122, "step": 2370}, {"loss": 1.4875, "grad_norm": 0.6176834106445312, "learning_rate": 0.0002, "epoch": 3.973288814691152, "step": 2380}, {"loss": 1.4984, "grad_norm": 0.6538102030754089, "learning_rate": 0.0002, "epoch": 3.989983305509182, "step": 2390}, {"eval_loss": 1.8920671939849854, "eval_runtime": 76.5227, "eval_samples_per_second": 6.73, "eval_steps_per_second": 0.849, "epoch": 4.0, "step": 2396}, {"loss": 1.3926, "grad_norm": 0.5683762431144714, "learning_rate": 0.0002, "epoch": 4.006677796327212, "step": 2400}, {"loss": 1.3387, "grad_norm": 0.6858044862747192, "learning_rate": 0.0002, "epoch": 4.023372287145242, "step": 2410}, {"loss": 1.4495, "grad_norm": 0.7614858150482178, "learning_rate": 0.0002, "epoch": 4.040066777963272, "step": 2420}, {"loss": 1.2696, "grad_norm": 0.709412693977356, "learning_rate": 0.0002, "epoch": 4.056761268781302, "step": 2430}, {"loss": 1.3836, "grad_norm": 0.7070785760879517, "learning_rate": 0.0002, "epoch": 4.073455759599332, "step": 2440}, {"loss": 1.3527, "grad_norm": 0.8815216422080994, "learning_rate": 0.0002, "epoch": 4.090150250417362, "step": 2450}, {"loss": 1.3731, "grad_norm": 0.759981632232666, "learning_rate": 0.0002, "epoch": 4.106844741235392, "step": 2460}, {"loss": 1.3393, "grad_norm": 0.6715240478515625, "learning_rate": 0.0002, "epoch": 4.123539232053423, "step": 2470}, {"loss": 1.3934, "grad_norm": 0.7503564953804016, "learning_rate": 0.0002, "epoch": 4.140233722871453, "step": 2480}, {"loss": 1.324, "grad_norm": 0.773743748664856, "learning_rate": 0.0002, "epoch": 4.156928213689483, "step": 2490}, {"loss": 1.3782, "grad_norm": 0.8850100040435791, "learning_rate": 0.0002, "epoch": 4.173622704507513, "step": 2500}, {"loss": 1.3183, "grad_norm": 0.7575962543487549, "learning_rate": 0.0002, "epoch": 4.190317195325543, "step": 2510}, {"loss": 1.3673, "grad_norm": 0.9117498397827148, "learning_rate": 0.0002, "epoch": 4.207011686143573, "step": 2520}, {"loss": 1.3242, "grad_norm": 0.7637559175491333, "learning_rate": 0.0002, "epoch": 4.223706176961603, "step": 2530}, {"loss": 1.3764, "grad_norm": 0.8178390264511108, "learning_rate": 0.0002, "epoch": 4.240400667779633, "step": 2540}, {"loss": 1.3808, "grad_norm": 0.8299263119697571, "learning_rate": 0.0002, "epoch": 4.257095158597663, "step": 2550}, {"loss": 1.3637, "grad_norm": 0.7238091230392456, "learning_rate": 0.0002, "epoch": 4.273789649415693, "step": 2560}, {"loss": 1.349, "grad_norm": 0.7468036413192749, "learning_rate": 0.0002, "epoch": 4.290484140233723, "step": 2570}, {"loss": 1.4422, "grad_norm": 0.8012791275978088, "learning_rate": 0.0002, "epoch": 4.307178631051753, "step": 2580}, {"loss": 1.3723, "grad_norm": 0.8302484154701233, "learning_rate": 0.0002, "epoch": 4.323873121869783, "step": 2590}, {"loss": 1.4013, "grad_norm": 0.751864492893219, "learning_rate": 0.0002, "epoch": 4.340567612687813, "step": 2600}, {"loss": 1.3881, "grad_norm": 0.8025410175323486, "learning_rate": 0.0002, "epoch": 4.357262103505843, "step": 2610}, {"loss": 1.3831, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 4.373956594323873, "step": 2620}, {"loss": 1.3721, "grad_norm": 0.8526890873908997, "learning_rate": 0.0002, "epoch": 4.390651085141903, "step": 2630}, {"loss": 1.4253, "grad_norm": 1.0536625385284424, "learning_rate": 0.0002, "epoch": 4.407345575959933, "step": 2640}, {"loss": 1.3736, "grad_norm": 0.7223818898200989, "learning_rate": 0.0002, "epoch": 4.424040066777963, "step": 2650}, {"loss": 1.4652, "grad_norm": 0.7981253266334534, "learning_rate": 0.0002, "epoch": 4.440734557595993, "step": 2660}, {"loss": 1.3878, "grad_norm": 0.7136162519454956, "learning_rate": 0.0002, "epoch": 4.457429048414023, "step": 2670}, {"loss": 1.4242, "grad_norm": 0.8008312582969666, "learning_rate": 0.0002, "epoch": 4.474123539232053, "step": 2680}, {"loss": 1.3448, "grad_norm": 0.7924065589904785, "learning_rate": 0.0002, "epoch": 4.490818030050083, "step": 2690}, {"loss": 1.402, "grad_norm": 0.8224287629127502, "learning_rate": 0.0002, "epoch": 4.507512520868113, "step": 2700}, {"loss": 1.2841, "grad_norm": 0.7494375109672546, "learning_rate": 0.0002, "epoch": 4.524207011686143, "step": 2710}, {"loss": 1.4471, "grad_norm": 0.8097899556159973, "learning_rate": 0.0002, "epoch": 4.540901502504173, "step": 2720}, {"loss": 1.4116, "grad_norm": 0.7728819251060486, "learning_rate": 0.0002, "epoch": 4.557595993322204, "step": 2730}, {"loss": 1.3549, "grad_norm": 0.9112362265586853, "learning_rate": 0.0002, "epoch": 4.574290484140234, "step": 2740}, {"loss": 1.4601, "grad_norm": 0.7502672076225281, "learning_rate": 0.0002, "epoch": 4.590984974958264, "step": 2750}, {"loss": 1.4216, "grad_norm": 0.8816406726837158, "learning_rate": 0.0002, "epoch": 4.607679465776294, "step": 2760}, {"loss": 1.3233, "grad_norm": 0.7117180228233337, "learning_rate": 0.0002, "epoch": 4.624373956594324, "step": 2770}, {"loss": 1.3886, "grad_norm": 0.8224529027938843, "learning_rate": 0.0002, "epoch": 4.641068447412354, "step": 2780}, {"loss": 1.3756, "grad_norm": 0.7625266313552856, "learning_rate": 0.0002, "epoch": 4.657762938230384, "step": 2790}, {"loss": 1.3953, "grad_norm": 0.7754318118095398, "learning_rate": 0.0002, "epoch": 4.674457429048414, "step": 2800}, {"loss": 1.4102, "grad_norm": 0.7907336354255676, "learning_rate": 0.0002, "epoch": 4.691151919866444, "step": 2810}, {"loss": 1.3277, "grad_norm": 0.7377734780311584, "learning_rate": 0.0002, "epoch": 4.707846410684474, "step": 2820}, {"loss": 1.3686, "grad_norm": 0.7380456328392029, "learning_rate": 0.0002, "epoch": 4.724540901502504, "step": 2830}, {"loss": 1.4405, "grad_norm": 0.7148023247718811, "learning_rate": 0.0002, "epoch": 4.741235392320534, "step": 2840}, {"loss": 1.4025, "grad_norm": 0.807048499584198, "learning_rate": 0.0002, "epoch": 4.757929883138564, "step": 2850}, {"loss": 1.3195, "grad_norm": 0.8444154858589172, "learning_rate": 0.0002, "epoch": 4.774624373956595, "step": 2860}, {"loss": 1.4282, "grad_norm": 0.8328704237937927, "learning_rate": 0.0002, "epoch": 4.791318864774624, "step": 2870}, {"loss": 1.413, "grad_norm": 0.89827960729599, "learning_rate": 0.0002, "epoch": 4.808013355592655, "step": 2880}, {"loss": 1.4488, "grad_norm": 0.7848225831985474, "learning_rate": 0.0002, "epoch": 4.824707846410685, "step": 2890}, {"loss": 1.3757, "grad_norm": 0.703802227973938, "learning_rate": 0.0002, "epoch": 4.841402337228715, "step": 2900}, {"loss": 1.4404, "grad_norm": 0.8092581629753113, "learning_rate": 0.0002, "epoch": 4.858096828046745, "step": 2910}, {"loss": 1.3812, "grad_norm": 0.7537722587585449, "learning_rate": 0.0002, "epoch": 4.874791318864775, "step": 2920}, {"loss": 1.4499, "grad_norm": 0.7966470122337341, "learning_rate": 0.0002, "epoch": 4.891485809682805, "step": 2930}, {"loss": 1.3922, "grad_norm": 0.7860329747200012, "learning_rate": 0.0002, "epoch": 4.908180300500835, "step": 2940}, {"loss": 1.4224, "grad_norm": 0.7964439988136292, "learning_rate": 0.0002, "epoch": 4.924874791318865, "step": 2950}, {"loss": 1.3869, "grad_norm": 0.740288257598877, "learning_rate": 0.0002, "epoch": 4.941569282136895, "step": 2960}, {"loss": 1.4321, "grad_norm": 0.7377685904502869, "learning_rate": 0.0002, "epoch": 4.958263772954925, "step": 2970}, {"loss": 1.4253, "grad_norm": 0.793484628200531, "learning_rate": 0.0002, "epoch": 4.974958263772955, "step": 2980}, {"loss": 1.3966, "grad_norm": 0.7710573077201843, "learning_rate": 0.0002, "epoch": 4.9916527545909855, "step": 2990}]} +{"epoch": 6.0, "step": 3594, "epoch_duration": 1533.8069713115692, "total_accumulated_duration": 8827.874804973602, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}, {"eval_loss": 1.8456445932388306, "eval_runtime": 87.6261, "eval_samples_per_second": 5.877, "eval_steps_per_second": 0.742, "epoch": 3.0, "step": 1797}, {"loss": 1.6224, "grad_norm": 0.40323764085769653, "learning_rate": 0.0002, "epoch": 3.005008347245409, "step": 1800}, {"loss": 1.5367, "grad_norm": 0.45069044828414917, "learning_rate": 0.0002, "epoch": 3.021702838063439, "step": 1810}, {"loss": 1.5271, "grad_norm": 0.6204925775527954, "learning_rate": 0.0002, "epoch": 3.038397328881469, "step": 1820}, {"loss": 1.5056, "grad_norm": 0.5857783555984497, "learning_rate": 0.0002, "epoch": 3.0550918196994994, "step": 1830}, {"loss": 1.5137, "grad_norm": 0.6776524782180786, "learning_rate": 0.0002, "epoch": 3.0717863105175294, "step": 1840}, {"loss": 1.5106, "grad_norm": 0.5486199855804443, "learning_rate": 0.0002, "epoch": 3.0884808013355594, "step": 1850}, {"loss": 1.414, "grad_norm": 0.5496503710746765, "learning_rate": 0.0002, "epoch": 3.1051752921535893, "step": 1860}, {"loss": 1.5181, "grad_norm": 0.5602648258209229, "learning_rate": 0.0002, "epoch": 3.1218697829716193, "step": 1870}, {"loss": 1.5406, "grad_norm": 1.0697380304336548, "learning_rate": 0.0002, "epoch": 3.1385642737896493, "step": 1880}, {"loss": 1.4889, "grad_norm": 0.6087332367897034, "learning_rate": 0.0002, "epoch": 3.1552587646076793, "step": 1890}, {"loss": 1.5219, "grad_norm": 0.5112161040306091, "learning_rate": 0.0002, "epoch": 3.1719532554257097, "step": 1900}, {"loss": 1.5139, "grad_norm": 0.6393680572509766, "learning_rate": 0.0002, "epoch": 3.1886477462437397, "step": 1910}, {"loss": 1.5337, "grad_norm": 0.7201815247535706, "learning_rate": 0.0002, "epoch": 3.2053422370617697, "step": 1920}, {"loss": 1.6055, "grad_norm": 0.5856018662452698, "learning_rate": 0.0002, "epoch": 3.2220367278797997, "step": 1930}, {"loss": 1.4791, "grad_norm": 0.581247866153717, "learning_rate": 0.0002, "epoch": 3.2387312186978297, "step": 1940}, {"loss": 1.5395, "grad_norm": 0.6055102944374084, "learning_rate": 0.0002, "epoch": 3.2554257095158596, "step": 1950}, {"loss": 1.5086, "grad_norm": 0.546894371509552, "learning_rate": 0.0002, "epoch": 3.27212020033389, "step": 1960}, {"loss": 1.5712, "grad_norm": 0.565558910369873, "learning_rate": 0.0002, "epoch": 3.28881469115192, "step": 1970}, {"loss": 1.47, "grad_norm": 1.2238883972167969, "learning_rate": 0.0002, "epoch": 3.30550918196995, "step": 1980}, {"loss": 1.4655, "grad_norm": 0.6362585425376892, "learning_rate": 0.0002, "epoch": 3.32220367278798, "step": 1990}, {"loss": 1.5157, "grad_norm": 0.6131124496459961, "learning_rate": 0.0002, "epoch": 3.33889816360601, "step": 2000}, {"loss": 1.5322, "grad_norm": 0.5181341767311096, "learning_rate": 0.0002, "epoch": 3.35559265442404, "step": 2010}, {"loss": 1.5039, "grad_norm": 0.6667609810829163, "learning_rate": 0.0002, "epoch": 3.37228714524207, "step": 2020}, {"loss": 1.5814, "grad_norm": 0.6488749980926514, "learning_rate": 0.0002, "epoch": 3.3889816360601, "step": 2030}, {"loss": 1.5226, "grad_norm": 0.5693286061286926, "learning_rate": 0.0002, "epoch": 3.4056761268781304, "step": 2040}, {"loss": 1.5121, "grad_norm": 0.6154143810272217, "learning_rate": 0.0002, "epoch": 3.4223706176961604, "step": 2050}, {"loss": 1.6033, "grad_norm": 0.6747981309890747, "learning_rate": 0.0002, "epoch": 3.4390651085141903, "step": 2060}, {"loss": 1.5857, "grad_norm": 0.5494789481163025, "learning_rate": 0.0002, "epoch": 3.4557595993322203, "step": 2070}, {"loss": 1.5223, "grad_norm": 2.481968402862549, "learning_rate": 0.0002, "epoch": 3.4724540901502503, "step": 2080}, {"loss": 1.4989, "grad_norm": 0.589784562587738, "learning_rate": 0.0002, "epoch": 3.4891485809682803, "step": 2090}, {"loss": 1.6227, "grad_norm": 0.6449820399284363, "learning_rate": 0.0002, "epoch": 3.5058430717863107, "step": 2100}, {"loss": 1.588, "grad_norm": 0.6467038989067078, "learning_rate": 0.0002, "epoch": 3.5225375626043407, "step": 2110}, {"loss": 1.5655, "grad_norm": 0.6533533334732056, "learning_rate": 0.0002, "epoch": 3.5392320534223707, "step": 2120}, {"loss": 1.6052, "grad_norm": 0.6804035902023315, "learning_rate": 0.0002, "epoch": 3.5559265442404007, "step": 2130}, {"loss": 1.5408, "grad_norm": 0.628773033618927, "learning_rate": 0.0002, "epoch": 3.5726210350584306, "step": 2140}, {"loss": 1.5487, "grad_norm": 0.6055739521980286, "learning_rate": 0.0002, "epoch": 3.5893155258764606, "step": 2150}, {"loss": 1.5305, "grad_norm": 0.6000894904136658, "learning_rate": 0.0002, "epoch": 3.6060100166944906, "step": 2160}, {"loss": 1.4742, "grad_norm": 0.5862473249435425, "learning_rate": 0.0002, "epoch": 3.6227045075125206, "step": 2170}, {"loss": 1.503, "grad_norm": 0.6547419428825378, "learning_rate": 0.0002, "epoch": 3.639398998330551, "step": 2180}, {"loss": 1.4704, "grad_norm": 0.5610318779945374, "learning_rate": 0.0002, "epoch": 3.656093489148581, "step": 2190}, {"loss": 1.4814, "grad_norm": 0.6387564539909363, "learning_rate": 0.0002, "epoch": 3.672787979966611, "step": 2200}, {"loss": 1.5356, "grad_norm": 0.6065090894699097, "learning_rate": 0.0002, "epoch": 3.689482470784641, "step": 2210}, {"loss": 1.5074, "grad_norm": 0.6266646981239319, "learning_rate": 0.0002, "epoch": 3.706176961602671, "step": 2220}, {"loss": 1.5146, "grad_norm": 0.626944363117218, "learning_rate": 0.0002, "epoch": 3.7228714524207014, "step": 2230}, {"loss": 1.5131, "grad_norm": 0.6043975949287415, "learning_rate": 0.0002, "epoch": 3.7395659432387314, "step": 2240}, {"loss": 1.5929, "grad_norm": 0.599732518196106, "learning_rate": 0.0002, "epoch": 3.7562604340567614, "step": 2250}, {"loss": 1.5236, "grad_norm": 0.6738389134407043, "learning_rate": 0.0002, "epoch": 3.7729549248747913, "step": 2260}, {"loss": 1.5003, "grad_norm": 0.5561335682868958, "learning_rate": 0.0002, "epoch": 3.7896494156928213, "step": 2270}, {"loss": 1.5013, "grad_norm": 0.6185726523399353, "learning_rate": 0.0002, "epoch": 3.8063439065108513, "step": 2280}, {"loss": 1.4996, "grad_norm": 0.6151532530784607, "learning_rate": 0.0002, "epoch": 3.8230383973288813, "step": 2290}, {"loss": 1.5453, "grad_norm": 0.5808233022689819, "learning_rate": 0.0002, "epoch": 3.8397328881469113, "step": 2300}, {"loss": 1.5223, "grad_norm": 0.6615163683891296, "learning_rate": 0.0002, "epoch": 3.8564273789649417, "step": 2310}, {"loss": 1.4365, "grad_norm": 0.5832979679107666, "learning_rate": 0.0002, "epoch": 3.8731218697829717, "step": 2320}, {"loss": 1.6036, "grad_norm": 0.6119300127029419, "learning_rate": 0.0002, "epoch": 3.8898163606010017, "step": 2330}, {"loss": 1.5581, "grad_norm": 0.6489697694778442, "learning_rate": 0.0002, "epoch": 3.9065108514190316, "step": 2340}, {"loss": 1.5601, "grad_norm": 0.5539063215255737, "learning_rate": 0.0002, "epoch": 3.9232053422370616, "step": 2350}, {"loss": 1.5174, "grad_norm": 0.6062877178192139, "learning_rate": 0.0002, "epoch": 3.939899833055092, "step": 2360}, {"loss": 1.5168, "grad_norm": 0.680609941482544, "learning_rate": 0.0002, "epoch": 3.956594323873122, "step": 2370}, {"loss": 1.4875, "grad_norm": 0.6176834106445312, "learning_rate": 0.0002, "epoch": 3.973288814691152, "step": 2380}, {"loss": 1.4984, "grad_norm": 0.6538102030754089, "learning_rate": 0.0002, "epoch": 3.989983305509182, "step": 2390}, {"eval_loss": 1.8920671939849854, "eval_runtime": 76.5227, "eval_samples_per_second": 6.73, "eval_steps_per_second": 0.849, "epoch": 4.0, "step": 2396}, {"loss": 1.3926, "grad_norm": 0.5683762431144714, "learning_rate": 0.0002, "epoch": 4.006677796327212, "step": 2400}, {"loss": 1.3387, "grad_norm": 0.6858044862747192, "learning_rate": 0.0002, "epoch": 4.023372287145242, "step": 2410}, {"loss": 1.4495, "grad_norm": 0.7614858150482178, "learning_rate": 0.0002, "epoch": 4.040066777963272, "step": 2420}, {"loss": 1.2696, "grad_norm": 0.709412693977356, "learning_rate": 0.0002, "epoch": 4.056761268781302, "step": 2430}, {"loss": 1.3836, "grad_norm": 0.7070785760879517, "learning_rate": 0.0002, "epoch": 4.073455759599332, "step": 2440}, {"loss": 1.3527, "grad_norm": 0.8815216422080994, "learning_rate": 0.0002, "epoch": 4.090150250417362, "step": 2450}, {"loss": 1.3731, "grad_norm": 0.759981632232666, "learning_rate": 0.0002, "epoch": 4.106844741235392, "step": 2460}, {"loss": 1.3393, "grad_norm": 0.6715240478515625, "learning_rate": 0.0002, "epoch": 4.123539232053423, "step": 2470}, {"loss": 1.3934, "grad_norm": 0.7503564953804016, "learning_rate": 0.0002, "epoch": 4.140233722871453, "step": 2480}, {"loss": 1.324, "grad_norm": 0.773743748664856, "learning_rate": 0.0002, "epoch": 4.156928213689483, "step": 2490}, {"loss": 1.3782, "grad_norm": 0.8850100040435791, "learning_rate": 0.0002, "epoch": 4.173622704507513, "step": 2500}, {"loss": 1.3183, "grad_norm": 0.7575962543487549, "learning_rate": 0.0002, "epoch": 4.190317195325543, "step": 2510}, {"loss": 1.3673, "grad_norm": 0.9117498397827148, "learning_rate": 0.0002, "epoch": 4.207011686143573, "step": 2520}, {"loss": 1.3242, "grad_norm": 0.7637559175491333, "learning_rate": 0.0002, "epoch": 4.223706176961603, "step": 2530}, {"loss": 1.3764, "grad_norm": 0.8178390264511108, "learning_rate": 0.0002, "epoch": 4.240400667779633, "step": 2540}, {"loss": 1.3808, "grad_norm": 0.8299263119697571, "learning_rate": 0.0002, "epoch": 4.257095158597663, "step": 2550}, {"loss": 1.3637, "grad_norm": 0.7238091230392456, "learning_rate": 0.0002, "epoch": 4.273789649415693, "step": 2560}, {"loss": 1.349, "grad_norm": 0.7468036413192749, "learning_rate": 0.0002, "epoch": 4.290484140233723, "step": 2570}, {"loss": 1.4422, "grad_norm": 0.8012791275978088, "learning_rate": 0.0002, "epoch": 4.307178631051753, "step": 2580}, {"loss": 1.3723, "grad_norm": 0.8302484154701233, "learning_rate": 0.0002, "epoch": 4.323873121869783, "step": 2590}, {"loss": 1.4013, "grad_norm": 0.751864492893219, "learning_rate": 0.0002, "epoch": 4.340567612687813, "step": 2600}, {"loss": 1.3881, "grad_norm": 0.8025410175323486, "learning_rate": 0.0002, "epoch": 4.357262103505843, "step": 2610}, {"loss": 1.3831, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 4.373956594323873, "step": 2620}, {"loss": 1.3721, "grad_norm": 0.8526890873908997, "learning_rate": 0.0002, "epoch": 4.390651085141903, "step": 2630}, {"loss": 1.4253, "grad_norm": 1.0536625385284424, "learning_rate": 0.0002, "epoch": 4.407345575959933, "step": 2640}, {"loss": 1.3736, "grad_norm": 0.7223818898200989, "learning_rate": 0.0002, "epoch": 4.424040066777963, "step": 2650}, {"loss": 1.4652, "grad_norm": 0.7981253266334534, "learning_rate": 0.0002, "epoch": 4.440734557595993, "step": 2660}, {"loss": 1.3878, "grad_norm": 0.7136162519454956, "learning_rate": 0.0002, "epoch": 4.457429048414023, "step": 2670}, {"loss": 1.4242, "grad_norm": 0.8008312582969666, "learning_rate": 0.0002, "epoch": 4.474123539232053, "step": 2680}, {"loss": 1.3448, "grad_norm": 0.7924065589904785, "learning_rate": 0.0002, "epoch": 4.490818030050083, "step": 2690}, {"loss": 1.402, "grad_norm": 0.8224287629127502, "learning_rate": 0.0002, "epoch": 4.507512520868113, "step": 2700}, {"loss": 1.2841, "grad_norm": 0.7494375109672546, "learning_rate": 0.0002, "epoch": 4.524207011686143, "step": 2710}, {"loss": 1.4471, "grad_norm": 0.8097899556159973, "learning_rate": 0.0002, "epoch": 4.540901502504173, "step": 2720}, {"loss": 1.4116, "grad_norm": 0.7728819251060486, "learning_rate": 0.0002, "epoch": 4.557595993322204, "step": 2730}, {"loss": 1.3549, "grad_norm": 0.9112362265586853, "learning_rate": 0.0002, "epoch": 4.574290484140234, "step": 2740}, {"loss": 1.4601, "grad_norm": 0.7502672076225281, "learning_rate": 0.0002, "epoch": 4.590984974958264, "step": 2750}, {"loss": 1.4216, "grad_norm": 0.8816406726837158, "learning_rate": 0.0002, "epoch": 4.607679465776294, "step": 2760}, {"loss": 1.3233, "grad_norm": 0.7117180228233337, "learning_rate": 0.0002, "epoch": 4.624373956594324, "step": 2770}, {"loss": 1.3886, "grad_norm": 0.8224529027938843, "learning_rate": 0.0002, "epoch": 4.641068447412354, "step": 2780}, {"loss": 1.3756, "grad_norm": 0.7625266313552856, "learning_rate": 0.0002, "epoch": 4.657762938230384, "step": 2790}, {"loss": 1.3953, "grad_norm": 0.7754318118095398, "learning_rate": 0.0002, "epoch": 4.674457429048414, "step": 2800}, {"loss": 1.4102, "grad_norm": 0.7907336354255676, "learning_rate": 0.0002, "epoch": 4.691151919866444, "step": 2810}, {"loss": 1.3277, "grad_norm": 0.7377734780311584, "learning_rate": 0.0002, "epoch": 4.707846410684474, "step": 2820}, {"loss": 1.3686, "grad_norm": 0.7380456328392029, "learning_rate": 0.0002, "epoch": 4.724540901502504, "step": 2830}, {"loss": 1.4405, "grad_norm": 0.7148023247718811, "learning_rate": 0.0002, "epoch": 4.741235392320534, "step": 2840}, {"loss": 1.4025, "grad_norm": 0.807048499584198, "learning_rate": 0.0002, "epoch": 4.757929883138564, "step": 2850}, {"loss": 1.3195, "grad_norm": 0.8444154858589172, "learning_rate": 0.0002, "epoch": 4.774624373956595, "step": 2860}, {"loss": 1.4282, "grad_norm": 0.8328704237937927, "learning_rate": 0.0002, "epoch": 4.791318864774624, "step": 2870}, {"loss": 1.413, "grad_norm": 0.89827960729599, "learning_rate": 0.0002, "epoch": 4.808013355592655, "step": 2880}, {"loss": 1.4488, "grad_norm": 0.7848225831985474, "learning_rate": 0.0002, "epoch": 4.824707846410685, "step": 2890}, {"loss": 1.3757, "grad_norm": 0.703802227973938, "learning_rate": 0.0002, "epoch": 4.841402337228715, "step": 2900}, {"loss": 1.4404, "grad_norm": 0.8092581629753113, "learning_rate": 0.0002, "epoch": 4.858096828046745, "step": 2910}, {"loss": 1.3812, "grad_norm": 0.7537722587585449, "learning_rate": 0.0002, "epoch": 4.874791318864775, "step": 2920}, {"loss": 1.4499, "grad_norm": 0.7966470122337341, "learning_rate": 0.0002, "epoch": 4.891485809682805, "step": 2930}, {"loss": 1.3922, "grad_norm": 0.7860329747200012, "learning_rate": 0.0002, "epoch": 4.908180300500835, "step": 2940}, {"loss": 1.4224, "grad_norm": 0.7964439988136292, "learning_rate": 0.0002, "epoch": 4.924874791318865, "step": 2950}, {"loss": 1.3869, "grad_norm": 0.740288257598877, "learning_rate": 0.0002, "epoch": 4.941569282136895, "step": 2960}, {"loss": 1.4321, "grad_norm": 0.7377685904502869, "learning_rate": 0.0002, "epoch": 4.958263772954925, "step": 2970}, {"loss": 1.4253, "grad_norm": 0.793484628200531, "learning_rate": 0.0002, "epoch": 4.974958263772955, "step": 2980}, {"loss": 1.3966, "grad_norm": 0.7710573077201843, "learning_rate": 0.0002, "epoch": 4.9916527545909855, "step": 2990}, {"eval_loss": 1.9764225482940674, "eval_runtime": 87.968, "eval_samples_per_second": 5.854, "eval_steps_per_second": 0.739, "epoch": 5.0, "step": 2995}, {"loss": 1.3493, "grad_norm": 0.680841326713562, "learning_rate": 0.0002, "epoch": 5.008347245409015, "step": 3000}, {"loss": 1.2462, "grad_norm": 0.8790825009346008, "learning_rate": 0.0002, "epoch": 5.025041736227045, "step": 3010}, {"loss": 1.2514, "grad_norm": 1.1519404649734497, "learning_rate": 0.0002, "epoch": 5.041736227045075, "step": 3020}, {"loss": 1.224, "grad_norm": 1.1939337253570557, "learning_rate": 0.0002, "epoch": 5.058430717863105, "step": 3030}, {"loss": 1.1274, "grad_norm": 1.1471049785614014, "learning_rate": 0.0002, "epoch": 5.075125208681135, "step": 3040}, {"loss": 1.1726, "grad_norm": 1.0808285474777222, "learning_rate": 0.0002, "epoch": 5.091819699499165, "step": 3050}, {"loss": 1.1644, "grad_norm": 1.0102492570877075, "learning_rate": 0.0002, "epoch": 5.108514190317195, "step": 3060}, {"loss": 1.1652, "grad_norm": 0.9869397282600403, "learning_rate": 0.0002, "epoch": 5.125208681135225, "step": 3070}, {"loss": 1.1997, "grad_norm": 0.9689525365829468, "learning_rate": 0.0002, "epoch": 5.141903171953255, "step": 3080}, {"loss": 1.1747, "grad_norm": 0.9293769598007202, "learning_rate": 0.0002, "epoch": 5.158597662771285, "step": 3090}, {"loss": 1.1728, "grad_norm": 0.9289103150367737, "learning_rate": 0.0002, "epoch": 5.175292153589315, "step": 3100}, {"loss": 1.2538, "grad_norm": 0.9736173152923584, "learning_rate": 0.0002, "epoch": 5.191986644407345, "step": 3110}, {"loss": 1.2429, "grad_norm": 1.3144289255142212, "learning_rate": 0.0002, "epoch": 5.208681135225375, "step": 3120}, {"loss": 1.2107, "grad_norm": 0.95982825756073, "learning_rate": 0.0002, "epoch": 5.225375626043405, "step": 3130}, {"loss": 1.2239, "grad_norm": 0.903189480304718, "learning_rate": 0.0002, "epoch": 5.242070116861436, "step": 3140}, {"loss": 1.2663, "grad_norm": 1.056692123413086, "learning_rate": 0.0002, "epoch": 5.258764607679466, "step": 3150}, {"loss": 1.2955, "grad_norm": 1.1169359683990479, "learning_rate": 0.0002, "epoch": 5.275459098497496, "step": 3160}, {"loss": 1.1559, "grad_norm": 1.2178374528884888, "learning_rate": 0.0002, "epoch": 5.292153589315526, "step": 3170}, {"loss": 1.2394, "grad_norm": 0.9956373572349548, "learning_rate": 0.0002, "epoch": 5.308848080133556, "step": 3180}, {"loss": 1.1792, "grad_norm": 0.959555447101593, "learning_rate": 0.0002, "epoch": 5.325542570951586, "step": 3190}, {"loss": 1.1817, "grad_norm": 0.9343846440315247, "learning_rate": 0.0002, "epoch": 5.342237061769616, "step": 3200}, {"loss": 1.2033, "grad_norm": 0.8806524872779846, "learning_rate": 0.0002, "epoch": 5.358931552587646, "step": 3210}, {"loss": 1.2511, "grad_norm": 0.9477803111076355, "learning_rate": 0.0002, "epoch": 5.375626043405676, "step": 3220}, {"loss": 1.2011, "grad_norm": 0.9975674152374268, "learning_rate": 0.0002, "epoch": 5.392320534223706, "step": 3230}, {"loss": 1.3012, "grad_norm": 0.9650071263313293, "learning_rate": 0.0002, "epoch": 5.409015025041736, "step": 3240}, {"loss": 1.2281, "grad_norm": 1.0170838832855225, "learning_rate": 0.0002, "epoch": 5.425709515859766, "step": 3250}, {"loss": 1.2635, "grad_norm": 1.158118486404419, "learning_rate": 0.0002, "epoch": 5.442404006677796, "step": 3260}, {"loss": 1.3333, "grad_norm": 1.0228497982025146, "learning_rate": 0.0002, "epoch": 5.459098497495827, "step": 3270}, {"loss": 1.1961, "grad_norm": 1.0101768970489502, "learning_rate": 0.0002, "epoch": 5.475792988313857, "step": 3280}, {"loss": 1.3058, "grad_norm": 1.0407295227050781, "learning_rate": 0.0002, "epoch": 5.492487479131887, "step": 3290}, {"loss": 1.2062, "grad_norm": 0.9337932467460632, "learning_rate": 0.0002, "epoch": 5.509181969949917, "step": 3300}, {"loss": 1.2241, "grad_norm": 1.0305527448654175, "learning_rate": 0.0002, "epoch": 5.525876460767947, "step": 3310}, {"loss": 1.2524, "grad_norm": 1.0523453950881958, "learning_rate": 0.0002, "epoch": 5.542570951585977, "step": 3320}, {"loss": 1.2526, "grad_norm": 0.9707391858100891, "learning_rate": 0.0002, "epoch": 5.559265442404007, "step": 3330}, {"loss": 1.3002, "grad_norm": 1.0054972171783447, "learning_rate": 0.0002, "epoch": 5.575959933222037, "step": 3340}, {"loss": 1.2459, "grad_norm": 1.0393340587615967, "learning_rate": 0.0002, "epoch": 5.592654424040067, "step": 3350}, {"loss": 1.2328, "grad_norm": 1.0671277046203613, "learning_rate": 0.0002, "epoch": 5.609348914858097, "step": 3360}, {"loss": 1.2415, "grad_norm": 1.0725873708724976, "learning_rate": 0.0002, "epoch": 5.626043405676127, "step": 3370}, {"loss": 1.2475, "grad_norm": 0.9844746589660645, "learning_rate": 0.0002, "epoch": 5.642737896494157, "step": 3380}, {"loss": 1.1997, "grad_norm": 0.9659736752510071, "learning_rate": 0.0002, "epoch": 5.659432387312187, "step": 3390}, {"loss": 1.2426, "grad_norm": 0.9152608513832092, "learning_rate": 0.0002, "epoch": 5.676126878130217, "step": 3400}, {"loss": 1.2424, "grad_norm": 0.9759509563446045, "learning_rate": 0.0002, "epoch": 5.692821368948247, "step": 3410}, {"loss": 1.2264, "grad_norm": 1.0662057399749756, "learning_rate": 0.0002, "epoch": 5.709515859766277, "step": 3420}, {"loss": 1.19, "grad_norm": 0.9780185222625732, "learning_rate": 0.0002, "epoch": 5.726210350584307, "step": 3430}, {"loss": 1.2603, "grad_norm": 0.9781617522239685, "learning_rate": 0.0002, "epoch": 5.742904841402337, "step": 3440}, {"loss": 1.2472, "grad_norm": 1.0790785551071167, "learning_rate": 0.0002, "epoch": 5.759599332220367, "step": 3450}, {"loss": 1.2697, "grad_norm": 1.0573410987854004, "learning_rate": 0.0002, "epoch": 5.776293823038397, "step": 3460}, {"loss": 1.2591, "grad_norm": 0.9953364729881287, "learning_rate": 0.0002, "epoch": 5.792988313856427, "step": 3470}, {"loss": 1.2361, "grad_norm": 1.0072667598724365, "learning_rate": 0.0002, "epoch": 5.809682804674457, "step": 3480}, {"loss": 1.286, "grad_norm": 0.9312750697135925, "learning_rate": 0.0002, "epoch": 5.826377295492487, "step": 3490}, {"loss": 1.2379, "grad_norm": 1.059614896774292, "learning_rate": 0.0002, "epoch": 5.843071786310517, "step": 3500}, {"loss": 1.2323, "grad_norm": 1.2089484930038452, "learning_rate": 0.0002, "epoch": 5.859766277128547, "step": 3510}, {"loss": 1.2047, "grad_norm": 1.0740607976913452, "learning_rate": 0.0002, "epoch": 5.876460767946577, "step": 3520}, {"loss": 1.2809, "grad_norm": 0.9620149731636047, "learning_rate": 0.0002, "epoch": 5.893155258764608, "step": 3530}, {"loss": 1.238, "grad_norm": 1.0482431650161743, "learning_rate": 0.0002, "epoch": 5.909849749582638, "step": 3540}, {"loss": 1.2621, "grad_norm": 0.9137503504753113, "learning_rate": 0.0002, "epoch": 5.926544240400668, "step": 3550}, {"loss": 1.3066, "grad_norm": 1.1599403619766235, "learning_rate": 0.0002, "epoch": 5.943238731218698, "step": 3560}, {"loss": 1.2556, "grad_norm": 0.911613404750824, "learning_rate": 0.0002, "epoch": 5.959933222036728, "step": 3570}, {"loss": 1.2746, "grad_norm": 0.9120033383369446, "learning_rate": 0.0002, "epoch": 5.976627712854758, "step": 3580}, {"loss": 1.2815, "grad_norm": 1.0588736534118652, "learning_rate": 0.0002, "epoch": 5.993322203672788, "step": 3590}]} +{"epoch": 7.0, "step": 4193, "epoch_duration": 1532.7661001682281, "total_accumulated_duration": 10360.64090514183, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}, {"eval_loss": 1.8456445932388306, "eval_runtime": 87.6261, "eval_samples_per_second": 5.877, "eval_steps_per_second": 0.742, "epoch": 3.0, "step": 1797}, {"loss": 1.6224, "grad_norm": 0.40323764085769653, "learning_rate": 0.0002, "epoch": 3.005008347245409, "step": 1800}, {"loss": 1.5367, "grad_norm": 0.45069044828414917, "learning_rate": 0.0002, "epoch": 3.021702838063439, "step": 1810}, {"loss": 1.5271, "grad_norm": 0.6204925775527954, "learning_rate": 0.0002, "epoch": 3.038397328881469, "step": 1820}, {"loss": 1.5056, "grad_norm": 0.5857783555984497, "learning_rate": 0.0002, "epoch": 3.0550918196994994, "step": 1830}, {"loss": 1.5137, "grad_norm": 0.6776524782180786, "learning_rate": 0.0002, "epoch": 3.0717863105175294, "step": 1840}, {"loss": 1.5106, "grad_norm": 0.5486199855804443, "learning_rate": 0.0002, "epoch": 3.0884808013355594, "step": 1850}, {"loss": 1.414, "grad_norm": 0.5496503710746765, "learning_rate": 0.0002, "epoch": 3.1051752921535893, "step": 1860}, {"loss": 1.5181, "grad_norm": 0.5602648258209229, "learning_rate": 0.0002, "epoch": 3.1218697829716193, "step": 1870}, {"loss": 1.5406, "grad_norm": 1.0697380304336548, "learning_rate": 0.0002, "epoch": 3.1385642737896493, "step": 1880}, {"loss": 1.4889, "grad_norm": 0.6087332367897034, "learning_rate": 0.0002, "epoch": 3.1552587646076793, "step": 1890}, {"loss": 1.5219, "grad_norm": 0.5112161040306091, "learning_rate": 0.0002, "epoch": 3.1719532554257097, "step": 1900}, {"loss": 1.5139, "grad_norm": 0.6393680572509766, "learning_rate": 0.0002, "epoch": 3.1886477462437397, "step": 1910}, {"loss": 1.5337, "grad_norm": 0.7201815247535706, "learning_rate": 0.0002, "epoch": 3.2053422370617697, "step": 1920}, {"loss": 1.6055, "grad_norm": 0.5856018662452698, "learning_rate": 0.0002, "epoch": 3.2220367278797997, "step": 1930}, {"loss": 1.4791, "grad_norm": 0.581247866153717, "learning_rate": 0.0002, "epoch": 3.2387312186978297, "step": 1940}, {"loss": 1.5395, "grad_norm": 0.6055102944374084, "learning_rate": 0.0002, "epoch": 3.2554257095158596, "step": 1950}, {"loss": 1.5086, "grad_norm": 0.546894371509552, "learning_rate": 0.0002, "epoch": 3.27212020033389, "step": 1960}, {"loss": 1.5712, "grad_norm": 0.565558910369873, "learning_rate": 0.0002, "epoch": 3.28881469115192, "step": 1970}, {"loss": 1.47, "grad_norm": 1.2238883972167969, "learning_rate": 0.0002, "epoch": 3.30550918196995, "step": 1980}, {"loss": 1.4655, "grad_norm": 0.6362585425376892, "learning_rate": 0.0002, "epoch": 3.32220367278798, "step": 1990}, {"loss": 1.5157, "grad_norm": 0.6131124496459961, "learning_rate": 0.0002, "epoch": 3.33889816360601, "step": 2000}, {"loss": 1.5322, "grad_norm": 0.5181341767311096, "learning_rate": 0.0002, "epoch": 3.35559265442404, "step": 2010}, {"loss": 1.5039, "grad_norm": 0.6667609810829163, "learning_rate": 0.0002, "epoch": 3.37228714524207, "step": 2020}, {"loss": 1.5814, "grad_norm": 0.6488749980926514, "learning_rate": 0.0002, "epoch": 3.3889816360601, "step": 2030}, {"loss": 1.5226, "grad_norm": 0.5693286061286926, "learning_rate": 0.0002, "epoch": 3.4056761268781304, "step": 2040}, {"loss": 1.5121, "grad_norm": 0.6154143810272217, "learning_rate": 0.0002, "epoch": 3.4223706176961604, "step": 2050}, {"loss": 1.6033, "grad_norm": 0.6747981309890747, "learning_rate": 0.0002, "epoch": 3.4390651085141903, "step": 2060}, {"loss": 1.5857, "grad_norm": 0.5494789481163025, "learning_rate": 0.0002, "epoch": 3.4557595993322203, "step": 2070}, {"loss": 1.5223, "grad_norm": 2.481968402862549, "learning_rate": 0.0002, "epoch": 3.4724540901502503, "step": 2080}, {"loss": 1.4989, "grad_norm": 0.589784562587738, "learning_rate": 0.0002, "epoch": 3.4891485809682803, "step": 2090}, {"loss": 1.6227, "grad_norm": 0.6449820399284363, "learning_rate": 0.0002, "epoch": 3.5058430717863107, "step": 2100}, {"loss": 1.588, "grad_norm": 0.6467038989067078, "learning_rate": 0.0002, "epoch": 3.5225375626043407, "step": 2110}, {"loss": 1.5655, "grad_norm": 0.6533533334732056, "learning_rate": 0.0002, "epoch": 3.5392320534223707, "step": 2120}, {"loss": 1.6052, "grad_norm": 0.6804035902023315, "learning_rate": 0.0002, "epoch": 3.5559265442404007, "step": 2130}, {"loss": 1.5408, "grad_norm": 0.628773033618927, "learning_rate": 0.0002, "epoch": 3.5726210350584306, "step": 2140}, {"loss": 1.5487, "grad_norm": 0.6055739521980286, "learning_rate": 0.0002, "epoch": 3.5893155258764606, "step": 2150}, {"loss": 1.5305, "grad_norm": 0.6000894904136658, "learning_rate": 0.0002, "epoch": 3.6060100166944906, "step": 2160}, {"loss": 1.4742, "grad_norm": 0.5862473249435425, "learning_rate": 0.0002, "epoch": 3.6227045075125206, "step": 2170}, {"loss": 1.503, "grad_norm": 0.6547419428825378, "learning_rate": 0.0002, "epoch": 3.639398998330551, "step": 2180}, {"loss": 1.4704, "grad_norm": 0.5610318779945374, "learning_rate": 0.0002, "epoch": 3.656093489148581, "step": 2190}, {"loss": 1.4814, "grad_norm": 0.6387564539909363, "learning_rate": 0.0002, "epoch": 3.672787979966611, "step": 2200}, {"loss": 1.5356, "grad_norm": 0.6065090894699097, "learning_rate": 0.0002, "epoch": 3.689482470784641, "step": 2210}, {"loss": 1.5074, "grad_norm": 0.6266646981239319, "learning_rate": 0.0002, "epoch": 3.706176961602671, "step": 2220}, {"loss": 1.5146, "grad_norm": 0.626944363117218, "learning_rate": 0.0002, "epoch": 3.7228714524207014, "step": 2230}, {"loss": 1.5131, "grad_norm": 0.6043975949287415, "learning_rate": 0.0002, "epoch": 3.7395659432387314, "step": 2240}, {"loss": 1.5929, "grad_norm": 0.599732518196106, "learning_rate": 0.0002, "epoch": 3.7562604340567614, "step": 2250}, {"loss": 1.5236, "grad_norm": 0.6738389134407043, "learning_rate": 0.0002, "epoch": 3.7729549248747913, "step": 2260}, {"loss": 1.5003, "grad_norm": 0.5561335682868958, "learning_rate": 0.0002, "epoch": 3.7896494156928213, "step": 2270}, {"loss": 1.5013, "grad_norm": 0.6185726523399353, "learning_rate": 0.0002, "epoch": 3.8063439065108513, "step": 2280}, {"loss": 1.4996, "grad_norm": 0.6151532530784607, "learning_rate": 0.0002, "epoch": 3.8230383973288813, "step": 2290}, {"loss": 1.5453, "grad_norm": 0.5808233022689819, "learning_rate": 0.0002, "epoch": 3.8397328881469113, "step": 2300}, {"loss": 1.5223, "grad_norm": 0.6615163683891296, "learning_rate": 0.0002, "epoch": 3.8564273789649417, "step": 2310}, {"loss": 1.4365, "grad_norm": 0.5832979679107666, "learning_rate": 0.0002, "epoch": 3.8731218697829717, "step": 2320}, {"loss": 1.6036, "grad_norm": 0.6119300127029419, "learning_rate": 0.0002, "epoch": 3.8898163606010017, "step": 2330}, {"loss": 1.5581, "grad_norm": 0.6489697694778442, "learning_rate": 0.0002, "epoch": 3.9065108514190316, "step": 2340}, {"loss": 1.5601, "grad_norm": 0.5539063215255737, "learning_rate": 0.0002, "epoch": 3.9232053422370616, "step": 2350}, {"loss": 1.5174, "grad_norm": 0.6062877178192139, "learning_rate": 0.0002, "epoch": 3.939899833055092, "step": 2360}, {"loss": 1.5168, "grad_norm": 0.680609941482544, "learning_rate": 0.0002, "epoch": 3.956594323873122, "step": 2370}, {"loss": 1.4875, "grad_norm": 0.6176834106445312, "learning_rate": 0.0002, "epoch": 3.973288814691152, "step": 2380}, {"loss": 1.4984, "grad_norm": 0.6538102030754089, "learning_rate": 0.0002, "epoch": 3.989983305509182, "step": 2390}, {"eval_loss": 1.8920671939849854, "eval_runtime": 76.5227, "eval_samples_per_second": 6.73, "eval_steps_per_second": 0.849, "epoch": 4.0, "step": 2396}, {"loss": 1.3926, "grad_norm": 0.5683762431144714, "learning_rate": 0.0002, "epoch": 4.006677796327212, "step": 2400}, {"loss": 1.3387, "grad_norm": 0.6858044862747192, "learning_rate": 0.0002, "epoch": 4.023372287145242, "step": 2410}, {"loss": 1.4495, "grad_norm": 0.7614858150482178, "learning_rate": 0.0002, "epoch": 4.040066777963272, "step": 2420}, {"loss": 1.2696, "grad_norm": 0.709412693977356, "learning_rate": 0.0002, "epoch": 4.056761268781302, "step": 2430}, {"loss": 1.3836, "grad_norm": 0.7070785760879517, "learning_rate": 0.0002, "epoch": 4.073455759599332, "step": 2440}, {"loss": 1.3527, "grad_norm": 0.8815216422080994, "learning_rate": 0.0002, "epoch": 4.090150250417362, "step": 2450}, {"loss": 1.3731, "grad_norm": 0.759981632232666, "learning_rate": 0.0002, "epoch": 4.106844741235392, "step": 2460}, {"loss": 1.3393, "grad_norm": 0.6715240478515625, "learning_rate": 0.0002, "epoch": 4.123539232053423, "step": 2470}, {"loss": 1.3934, "grad_norm": 0.7503564953804016, "learning_rate": 0.0002, "epoch": 4.140233722871453, "step": 2480}, {"loss": 1.324, "grad_norm": 0.773743748664856, "learning_rate": 0.0002, "epoch": 4.156928213689483, "step": 2490}, {"loss": 1.3782, "grad_norm": 0.8850100040435791, "learning_rate": 0.0002, "epoch": 4.173622704507513, "step": 2500}, {"loss": 1.3183, "grad_norm": 0.7575962543487549, "learning_rate": 0.0002, "epoch": 4.190317195325543, "step": 2510}, {"loss": 1.3673, "grad_norm": 0.9117498397827148, "learning_rate": 0.0002, "epoch": 4.207011686143573, "step": 2520}, {"loss": 1.3242, "grad_norm": 0.7637559175491333, "learning_rate": 0.0002, "epoch": 4.223706176961603, "step": 2530}, {"loss": 1.3764, "grad_norm": 0.8178390264511108, "learning_rate": 0.0002, "epoch": 4.240400667779633, "step": 2540}, {"loss": 1.3808, "grad_norm": 0.8299263119697571, "learning_rate": 0.0002, "epoch": 4.257095158597663, "step": 2550}, {"loss": 1.3637, "grad_norm": 0.7238091230392456, "learning_rate": 0.0002, "epoch": 4.273789649415693, "step": 2560}, {"loss": 1.349, "grad_norm": 0.7468036413192749, "learning_rate": 0.0002, "epoch": 4.290484140233723, "step": 2570}, {"loss": 1.4422, "grad_norm": 0.8012791275978088, "learning_rate": 0.0002, "epoch": 4.307178631051753, "step": 2580}, {"loss": 1.3723, "grad_norm": 0.8302484154701233, "learning_rate": 0.0002, "epoch": 4.323873121869783, "step": 2590}, {"loss": 1.4013, "grad_norm": 0.751864492893219, "learning_rate": 0.0002, "epoch": 4.340567612687813, "step": 2600}, {"loss": 1.3881, "grad_norm": 0.8025410175323486, "learning_rate": 0.0002, "epoch": 4.357262103505843, "step": 2610}, {"loss": 1.3831, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 4.373956594323873, "step": 2620}, {"loss": 1.3721, "grad_norm": 0.8526890873908997, "learning_rate": 0.0002, "epoch": 4.390651085141903, "step": 2630}, {"loss": 1.4253, "grad_norm": 1.0536625385284424, "learning_rate": 0.0002, "epoch": 4.407345575959933, "step": 2640}, {"loss": 1.3736, "grad_norm": 0.7223818898200989, "learning_rate": 0.0002, "epoch": 4.424040066777963, "step": 2650}, {"loss": 1.4652, "grad_norm": 0.7981253266334534, "learning_rate": 0.0002, "epoch": 4.440734557595993, "step": 2660}, {"loss": 1.3878, "grad_norm": 0.7136162519454956, "learning_rate": 0.0002, "epoch": 4.457429048414023, "step": 2670}, {"loss": 1.4242, "grad_norm": 0.8008312582969666, "learning_rate": 0.0002, "epoch": 4.474123539232053, "step": 2680}, {"loss": 1.3448, "grad_norm": 0.7924065589904785, "learning_rate": 0.0002, "epoch": 4.490818030050083, "step": 2690}, {"loss": 1.402, "grad_norm": 0.8224287629127502, "learning_rate": 0.0002, "epoch": 4.507512520868113, "step": 2700}, {"loss": 1.2841, "grad_norm": 0.7494375109672546, "learning_rate": 0.0002, "epoch": 4.524207011686143, "step": 2710}, {"loss": 1.4471, "grad_norm": 0.8097899556159973, "learning_rate": 0.0002, "epoch": 4.540901502504173, "step": 2720}, {"loss": 1.4116, "grad_norm": 0.7728819251060486, "learning_rate": 0.0002, "epoch": 4.557595993322204, "step": 2730}, {"loss": 1.3549, "grad_norm": 0.9112362265586853, "learning_rate": 0.0002, "epoch": 4.574290484140234, "step": 2740}, {"loss": 1.4601, "grad_norm": 0.7502672076225281, "learning_rate": 0.0002, "epoch": 4.590984974958264, "step": 2750}, {"loss": 1.4216, "grad_norm": 0.8816406726837158, "learning_rate": 0.0002, "epoch": 4.607679465776294, "step": 2760}, {"loss": 1.3233, "grad_norm": 0.7117180228233337, "learning_rate": 0.0002, "epoch": 4.624373956594324, "step": 2770}, {"loss": 1.3886, "grad_norm": 0.8224529027938843, "learning_rate": 0.0002, "epoch": 4.641068447412354, "step": 2780}, {"loss": 1.3756, "grad_norm": 0.7625266313552856, "learning_rate": 0.0002, "epoch": 4.657762938230384, "step": 2790}, {"loss": 1.3953, "grad_norm": 0.7754318118095398, "learning_rate": 0.0002, "epoch": 4.674457429048414, "step": 2800}, {"loss": 1.4102, "grad_norm": 0.7907336354255676, "learning_rate": 0.0002, "epoch": 4.691151919866444, "step": 2810}, {"loss": 1.3277, "grad_norm": 0.7377734780311584, "learning_rate": 0.0002, "epoch": 4.707846410684474, "step": 2820}, {"loss": 1.3686, "grad_norm": 0.7380456328392029, "learning_rate": 0.0002, "epoch": 4.724540901502504, "step": 2830}, {"loss": 1.4405, "grad_norm": 0.7148023247718811, "learning_rate": 0.0002, "epoch": 4.741235392320534, "step": 2840}, {"loss": 1.4025, "grad_norm": 0.807048499584198, "learning_rate": 0.0002, "epoch": 4.757929883138564, "step": 2850}, {"loss": 1.3195, "grad_norm": 0.8444154858589172, "learning_rate": 0.0002, "epoch": 4.774624373956595, "step": 2860}, {"loss": 1.4282, "grad_norm": 0.8328704237937927, "learning_rate": 0.0002, "epoch": 4.791318864774624, "step": 2870}, {"loss": 1.413, "grad_norm": 0.89827960729599, "learning_rate": 0.0002, "epoch": 4.808013355592655, "step": 2880}, {"loss": 1.4488, "grad_norm": 0.7848225831985474, "learning_rate": 0.0002, "epoch": 4.824707846410685, "step": 2890}, {"loss": 1.3757, "grad_norm": 0.703802227973938, "learning_rate": 0.0002, "epoch": 4.841402337228715, "step": 2900}, {"loss": 1.4404, "grad_norm": 0.8092581629753113, "learning_rate": 0.0002, "epoch": 4.858096828046745, "step": 2910}, {"loss": 1.3812, "grad_norm": 0.7537722587585449, "learning_rate": 0.0002, "epoch": 4.874791318864775, "step": 2920}, {"loss": 1.4499, "grad_norm": 0.7966470122337341, "learning_rate": 0.0002, "epoch": 4.891485809682805, "step": 2930}, {"loss": 1.3922, "grad_norm": 0.7860329747200012, "learning_rate": 0.0002, "epoch": 4.908180300500835, "step": 2940}, {"loss": 1.4224, "grad_norm": 0.7964439988136292, "learning_rate": 0.0002, "epoch": 4.924874791318865, "step": 2950}, {"loss": 1.3869, "grad_norm": 0.740288257598877, "learning_rate": 0.0002, "epoch": 4.941569282136895, "step": 2960}, {"loss": 1.4321, "grad_norm": 0.7377685904502869, "learning_rate": 0.0002, "epoch": 4.958263772954925, "step": 2970}, {"loss": 1.4253, "grad_norm": 0.793484628200531, "learning_rate": 0.0002, "epoch": 4.974958263772955, "step": 2980}, {"loss": 1.3966, "grad_norm": 0.7710573077201843, "learning_rate": 0.0002, "epoch": 4.9916527545909855, "step": 2990}, {"eval_loss": 1.9764225482940674, "eval_runtime": 87.968, "eval_samples_per_second": 5.854, "eval_steps_per_second": 0.739, "epoch": 5.0, "step": 2995}, {"loss": 1.3493, "grad_norm": 0.680841326713562, "learning_rate": 0.0002, "epoch": 5.008347245409015, "step": 3000}, {"loss": 1.2462, "grad_norm": 0.8790825009346008, "learning_rate": 0.0002, "epoch": 5.025041736227045, "step": 3010}, {"loss": 1.2514, "grad_norm": 1.1519404649734497, "learning_rate": 0.0002, "epoch": 5.041736227045075, "step": 3020}, {"loss": 1.224, "grad_norm": 1.1939337253570557, "learning_rate": 0.0002, "epoch": 5.058430717863105, "step": 3030}, {"loss": 1.1274, "grad_norm": 1.1471049785614014, "learning_rate": 0.0002, "epoch": 5.075125208681135, "step": 3040}, {"loss": 1.1726, "grad_norm": 1.0808285474777222, "learning_rate": 0.0002, "epoch": 5.091819699499165, "step": 3050}, {"loss": 1.1644, "grad_norm": 1.0102492570877075, "learning_rate": 0.0002, "epoch": 5.108514190317195, "step": 3060}, {"loss": 1.1652, "grad_norm": 0.9869397282600403, "learning_rate": 0.0002, "epoch": 5.125208681135225, "step": 3070}, {"loss": 1.1997, "grad_norm": 0.9689525365829468, "learning_rate": 0.0002, "epoch": 5.141903171953255, "step": 3080}, {"loss": 1.1747, "grad_norm": 0.9293769598007202, "learning_rate": 0.0002, "epoch": 5.158597662771285, "step": 3090}, {"loss": 1.1728, "grad_norm": 0.9289103150367737, "learning_rate": 0.0002, "epoch": 5.175292153589315, "step": 3100}, {"loss": 1.2538, "grad_norm": 0.9736173152923584, "learning_rate": 0.0002, "epoch": 5.191986644407345, "step": 3110}, {"loss": 1.2429, "grad_norm": 1.3144289255142212, "learning_rate": 0.0002, "epoch": 5.208681135225375, "step": 3120}, {"loss": 1.2107, "grad_norm": 0.95982825756073, "learning_rate": 0.0002, "epoch": 5.225375626043405, "step": 3130}, {"loss": 1.2239, "grad_norm": 0.903189480304718, "learning_rate": 0.0002, "epoch": 5.242070116861436, "step": 3140}, {"loss": 1.2663, "grad_norm": 1.056692123413086, "learning_rate": 0.0002, "epoch": 5.258764607679466, "step": 3150}, {"loss": 1.2955, "grad_norm": 1.1169359683990479, "learning_rate": 0.0002, "epoch": 5.275459098497496, "step": 3160}, {"loss": 1.1559, "grad_norm": 1.2178374528884888, "learning_rate": 0.0002, "epoch": 5.292153589315526, "step": 3170}, {"loss": 1.2394, "grad_norm": 0.9956373572349548, "learning_rate": 0.0002, "epoch": 5.308848080133556, "step": 3180}, {"loss": 1.1792, "grad_norm": 0.959555447101593, "learning_rate": 0.0002, "epoch": 5.325542570951586, "step": 3190}, {"loss": 1.1817, "grad_norm": 0.9343846440315247, "learning_rate": 0.0002, "epoch": 5.342237061769616, "step": 3200}, {"loss": 1.2033, "grad_norm": 0.8806524872779846, "learning_rate": 0.0002, "epoch": 5.358931552587646, "step": 3210}, {"loss": 1.2511, "grad_norm": 0.9477803111076355, "learning_rate": 0.0002, "epoch": 5.375626043405676, "step": 3220}, {"loss": 1.2011, "grad_norm": 0.9975674152374268, "learning_rate": 0.0002, "epoch": 5.392320534223706, "step": 3230}, {"loss": 1.3012, "grad_norm": 0.9650071263313293, "learning_rate": 0.0002, "epoch": 5.409015025041736, "step": 3240}, {"loss": 1.2281, "grad_norm": 1.0170838832855225, "learning_rate": 0.0002, "epoch": 5.425709515859766, "step": 3250}, {"loss": 1.2635, "grad_norm": 1.158118486404419, "learning_rate": 0.0002, "epoch": 5.442404006677796, "step": 3260}, {"loss": 1.3333, "grad_norm": 1.0228497982025146, "learning_rate": 0.0002, "epoch": 5.459098497495827, "step": 3270}, {"loss": 1.1961, "grad_norm": 1.0101768970489502, "learning_rate": 0.0002, "epoch": 5.475792988313857, "step": 3280}, {"loss": 1.3058, "grad_norm": 1.0407295227050781, "learning_rate": 0.0002, "epoch": 5.492487479131887, "step": 3290}, {"loss": 1.2062, "grad_norm": 0.9337932467460632, "learning_rate": 0.0002, "epoch": 5.509181969949917, "step": 3300}, {"loss": 1.2241, "grad_norm": 1.0305527448654175, "learning_rate": 0.0002, "epoch": 5.525876460767947, "step": 3310}, {"loss": 1.2524, "grad_norm": 1.0523453950881958, "learning_rate": 0.0002, "epoch": 5.542570951585977, "step": 3320}, {"loss": 1.2526, "grad_norm": 0.9707391858100891, "learning_rate": 0.0002, "epoch": 5.559265442404007, "step": 3330}, {"loss": 1.3002, "grad_norm": 1.0054972171783447, "learning_rate": 0.0002, "epoch": 5.575959933222037, "step": 3340}, {"loss": 1.2459, "grad_norm": 1.0393340587615967, "learning_rate": 0.0002, "epoch": 5.592654424040067, "step": 3350}, {"loss": 1.2328, "grad_norm": 1.0671277046203613, "learning_rate": 0.0002, "epoch": 5.609348914858097, "step": 3360}, {"loss": 1.2415, "grad_norm": 1.0725873708724976, "learning_rate": 0.0002, "epoch": 5.626043405676127, "step": 3370}, {"loss": 1.2475, "grad_norm": 0.9844746589660645, "learning_rate": 0.0002, "epoch": 5.642737896494157, "step": 3380}, {"loss": 1.1997, "grad_norm": 0.9659736752510071, "learning_rate": 0.0002, "epoch": 5.659432387312187, "step": 3390}, {"loss": 1.2426, "grad_norm": 0.9152608513832092, "learning_rate": 0.0002, "epoch": 5.676126878130217, "step": 3400}, {"loss": 1.2424, "grad_norm": 0.9759509563446045, "learning_rate": 0.0002, "epoch": 5.692821368948247, "step": 3410}, {"loss": 1.2264, "grad_norm": 1.0662057399749756, "learning_rate": 0.0002, "epoch": 5.709515859766277, "step": 3420}, {"loss": 1.19, "grad_norm": 0.9780185222625732, "learning_rate": 0.0002, "epoch": 5.726210350584307, "step": 3430}, {"loss": 1.2603, "grad_norm": 0.9781617522239685, "learning_rate": 0.0002, "epoch": 5.742904841402337, "step": 3440}, {"loss": 1.2472, "grad_norm": 1.0790785551071167, "learning_rate": 0.0002, "epoch": 5.759599332220367, "step": 3450}, {"loss": 1.2697, "grad_norm": 1.0573410987854004, "learning_rate": 0.0002, "epoch": 5.776293823038397, "step": 3460}, {"loss": 1.2591, "grad_norm": 0.9953364729881287, "learning_rate": 0.0002, "epoch": 5.792988313856427, "step": 3470}, {"loss": 1.2361, "grad_norm": 1.0072667598724365, "learning_rate": 0.0002, "epoch": 5.809682804674457, "step": 3480}, {"loss": 1.286, "grad_norm": 0.9312750697135925, "learning_rate": 0.0002, "epoch": 5.826377295492487, "step": 3490}, {"loss": 1.2379, "grad_norm": 1.059614896774292, "learning_rate": 0.0002, "epoch": 5.843071786310517, "step": 3500}, {"loss": 1.2323, "grad_norm": 1.2089484930038452, "learning_rate": 0.0002, "epoch": 5.859766277128547, "step": 3510}, {"loss": 1.2047, "grad_norm": 1.0740607976913452, "learning_rate": 0.0002, "epoch": 5.876460767946577, "step": 3520}, {"loss": 1.2809, "grad_norm": 0.9620149731636047, "learning_rate": 0.0002, "epoch": 5.893155258764608, "step": 3530}, {"loss": 1.238, "grad_norm": 1.0482431650161743, "learning_rate": 0.0002, "epoch": 5.909849749582638, "step": 3540}, {"loss": 1.2621, "grad_norm": 0.9137503504753113, "learning_rate": 0.0002, "epoch": 5.926544240400668, "step": 3550}, {"loss": 1.3066, "grad_norm": 1.1599403619766235, "learning_rate": 0.0002, "epoch": 5.943238731218698, "step": 3560}, {"loss": 1.2556, "grad_norm": 0.911613404750824, "learning_rate": 0.0002, "epoch": 5.959933222036728, "step": 3570}, {"loss": 1.2746, "grad_norm": 0.9120033383369446, "learning_rate": 0.0002, "epoch": 5.976627712854758, "step": 3580}, {"loss": 1.2815, "grad_norm": 1.0588736534118652, "learning_rate": 0.0002, "epoch": 5.993322203672788, "step": 3590}, {"eval_loss": 2.0921614170074463, "eval_runtime": 71.974, "eval_samples_per_second": 7.155, "eval_steps_per_second": 0.903, "epoch": 6.0, "step": 3594}, {"loss": 1.1397, "grad_norm": 0.9213348627090454, "learning_rate": 0.0002, "epoch": 6.010016694490818, "step": 3600}, {"loss": 1.07, "grad_norm": 1.137640357017517, "learning_rate": 0.0002, "epoch": 6.026711185308848, "step": 3610}, {"loss": 0.9953, "grad_norm": 1.200276494026184, "learning_rate": 0.0002, "epoch": 6.043405676126878, "step": 3620}, {"loss": 1.0356, "grad_norm": 1.335649013519287, "learning_rate": 0.0002, "epoch": 6.060100166944908, "step": 3630}, {"loss": 1.1154, "grad_norm": 1.1353906393051147, "learning_rate": 0.0002, "epoch": 6.076794657762938, "step": 3640}, {"loss": 1.0481, "grad_norm": 1.0406795740127563, "learning_rate": 0.0002, "epoch": 6.093489148580968, "step": 3650}, {"loss": 1.0594, "grad_norm": 1.2691017389297485, "learning_rate": 0.0002, "epoch": 6.110183639398999, "step": 3660}, {"loss": 1.0594, "grad_norm": 1.3334898948669434, "learning_rate": 0.0002, "epoch": 6.126878130217029, "step": 3670}, {"loss": 1.0186, "grad_norm": 1.1766020059585571, "learning_rate": 0.0002, "epoch": 6.143572621035059, "step": 3680}, {"loss": 1.0431, "grad_norm": 1.1079157590866089, "learning_rate": 0.0002, "epoch": 6.160267111853089, "step": 3690}, {"loss": 1.0395, "grad_norm": 1.4312299489974976, "learning_rate": 0.0002, "epoch": 6.176961602671119, "step": 3700}, {"loss": 1.1095, "grad_norm": 1.2636224031448364, "learning_rate": 0.0002, "epoch": 6.193656093489149, "step": 3710}, {"loss": 1.0669, "grad_norm": 1.1957253217697144, "learning_rate": 0.0002, "epoch": 6.210350584307179, "step": 3720}, {"loss": 1.0199, "grad_norm": 1.1044131517410278, "learning_rate": 0.0002, "epoch": 6.227045075125209, "step": 3730}, {"loss": 1.0316, "grad_norm": 1.2045193910598755, "learning_rate": 0.0002, "epoch": 6.243739565943239, "step": 3740}, {"loss": 1.1218, "grad_norm": 1.0740957260131836, "learning_rate": 0.0002, "epoch": 6.260434056761269, "step": 3750}, {"loss": 1.0271, "grad_norm": 1.1548833847045898, "learning_rate": 0.0002, "epoch": 6.277128547579299, "step": 3760}, {"loss": 1.14, "grad_norm": 1.257440209388733, "learning_rate": 0.0002, "epoch": 6.293823038397329, "step": 3770}, {"loss": 1.0762, "grad_norm": 1.1988940238952637, "learning_rate": 0.0002, "epoch": 6.310517529215359, "step": 3780}, {"loss": 1.0627, "grad_norm": 1.1707229614257812, "learning_rate": 0.0002, "epoch": 6.3272120200333895, "step": 3790}, {"loss": 1.053, "grad_norm": 1.360107660293579, "learning_rate": 0.0002, "epoch": 6.343906510851419, "step": 3800}, {"loss": 1.0637, "grad_norm": 1.249742031097412, "learning_rate": 0.0002, "epoch": 6.360601001669449, "step": 3810}, {"loss": 1.0521, "grad_norm": 1.2729560136795044, "learning_rate": 0.0002, "epoch": 6.377295492487479, "step": 3820}, {"loss": 1.1217, "grad_norm": 1.241761565208435, "learning_rate": 0.0002, "epoch": 6.393989983305509, "step": 3830}, {"loss": 1.0648, "grad_norm": 1.1892873048782349, "learning_rate": 0.0002, "epoch": 6.410684474123539, "step": 3840}, {"loss": 1.1092, "grad_norm": 1.1766357421875, "learning_rate": 0.0002, "epoch": 6.427378964941569, "step": 3850}, {"loss": 1.0872, "grad_norm": 1.2642168998718262, "learning_rate": 0.0002, "epoch": 6.444073455759599, "step": 3860}, {"loss": 1.0748, "grad_norm": 1.3390182256698608, "learning_rate": 0.0002, "epoch": 6.460767946577629, "step": 3870}, {"loss": 1.0657, "grad_norm": 1.183168649673462, "learning_rate": 0.0002, "epoch": 6.477462437395659, "step": 3880}, {"loss": 1.0696, "grad_norm": 1.1458892822265625, "learning_rate": 0.0002, "epoch": 6.494156928213689, "step": 3890}, {"loss": 1.1625, "grad_norm": 1.2736095190048218, "learning_rate": 0.0002, "epoch": 6.510851419031719, "step": 3900}, {"loss": 1.1175, "grad_norm": 1.323607087135315, "learning_rate": 0.0002, "epoch": 6.527545909849749, "step": 3910}, {"loss": 1.1258, "grad_norm": 1.2177817821502686, "learning_rate": 0.0002, "epoch": 6.54424040066778, "step": 3920}, {"loss": 1.0333, "grad_norm": 1.3270750045776367, "learning_rate": 0.0002, "epoch": 6.560934891485809, "step": 3930}, {"loss": 1.0589, "grad_norm": 1.0974372625350952, "learning_rate": 0.0002, "epoch": 6.57762938230384, "step": 3940}, {"loss": 1.1347, "grad_norm": 1.3352670669555664, "learning_rate": 0.0002, "epoch": 6.59432387312187, "step": 3950}, {"loss": 1.0684, "grad_norm": 1.3174126148223877, "learning_rate": 0.0002, "epoch": 6.6110183639399, "step": 3960}, {"loss": 1.1697, "grad_norm": 1.1783626079559326, "learning_rate": 0.0002, "epoch": 6.62771285475793, "step": 3970}, {"loss": 1.1256, "grad_norm": 1.1886446475982666, "learning_rate": 0.0002, "epoch": 6.64440734557596, "step": 3980}, {"loss": 1.1066, "grad_norm": 1.2215187549591064, "learning_rate": 0.0002, "epoch": 6.66110183639399, "step": 3990}, {"loss": 1.1236, "grad_norm": 1.0320725440979004, "learning_rate": 0.0002, "epoch": 6.67779632721202, "step": 4000}, {"loss": 1.0828, "grad_norm": 1.340338110923767, "learning_rate": 0.0002, "epoch": 6.69449081803005, "step": 4010}, {"loss": 1.0942, "grad_norm": 1.1496273279190063, "learning_rate": 0.0002, "epoch": 6.71118530884808, "step": 4020}, {"loss": 1.1465, "grad_norm": 1.5720409154891968, "learning_rate": 0.0002, "epoch": 6.72787979966611, "step": 4030}, {"loss": 1.1385, "grad_norm": 1.497376799583435, "learning_rate": 0.0002, "epoch": 6.74457429048414, "step": 4040}, {"loss": 1.0808, "grad_norm": 1.1594456434249878, "learning_rate": 0.0002, "epoch": 6.76126878130217, "step": 4050}, {"loss": 1.1541, "grad_norm": 1.326546549797058, "learning_rate": 0.0002, "epoch": 6.7779632721202, "step": 4060}, {"loss": 1.1314, "grad_norm": 1.18723726272583, "learning_rate": 0.0002, "epoch": 6.794657762938231, "step": 4070}, {"loss": 1.1906, "grad_norm": 1.2974154949188232, "learning_rate": 0.0002, "epoch": 6.811352253756261, "step": 4080}, {"loss": 1.0534, "grad_norm": 1.207748532295227, "learning_rate": 0.0002, "epoch": 6.828046744574291, "step": 4090}, {"loss": 1.0951, "grad_norm": 1.2398537397384644, "learning_rate": 0.0002, "epoch": 6.844741235392321, "step": 4100}, {"loss": 1.1348, "grad_norm": 1.1657508611679077, "learning_rate": 0.0002, "epoch": 6.861435726210351, "step": 4110}, {"loss": 1.1315, "grad_norm": 1.1986382007598877, "learning_rate": 0.0002, "epoch": 6.878130217028381, "step": 4120}, {"loss": 1.0781, "grad_norm": 1.407080054283142, "learning_rate": 0.0002, "epoch": 6.894824707846411, "step": 4130}, {"loss": 1.0515, "grad_norm": 1.0725297927856445, "learning_rate": 0.0002, "epoch": 6.911519198664441, "step": 4140}, {"loss": 1.1602, "grad_norm": 1.2659991979599, "learning_rate": 0.0002, "epoch": 6.928213689482471, "step": 4150}, {"loss": 1.1373, "grad_norm": 1.0579404830932617, "learning_rate": 0.0002, "epoch": 6.944908180300501, "step": 4160}, {"loss": 1.1441, "grad_norm": 1.254502296447754, "learning_rate": 0.0002, "epoch": 6.961602671118531, "step": 4170}, {"loss": 1.1019, "grad_norm": 1.2666021585464478, "learning_rate": 0.0002, "epoch": 6.978297161936561, "step": 4180}, {"loss": 1.0675, "grad_norm": 1.236793041229248, "learning_rate": 0.0002, "epoch": 6.994991652754591, "step": 4190}]} +{"epoch": 8.0, "step": 4792, "epoch_duration": 1229.9026808738708, "total_accumulated_duration": 11590.543586015701, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-0/checkpoint-1198", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6298, "grad_norm": 0.4811326861381531, "learning_rate": 0.0002, "epoch": 0.01669449081803005, "step": 10}, {"loss": 2.2673, "grad_norm": 0.697903573513031, "learning_rate": 0.0002, "epoch": 0.0333889816360601, "step": 20}, {"loss": 2.0746, "grad_norm": 0.5622886419296265, "learning_rate": 0.0002, "epoch": 0.05008347245409015, "step": 30}, {"loss": 1.9808, "grad_norm": 0.4684421122074127, "learning_rate": 0.0002, "epoch": 0.0667779632721202, "step": 40}, {"loss": 1.9796, "grad_norm": 0.4790354371070862, "learning_rate": 0.0002, "epoch": 0.08347245409015025, "step": 50}, {"loss": 1.9269, "grad_norm": 0.5846750140190125, "learning_rate": 0.0002, "epoch": 0.1001669449081803, "step": 60}, {"loss": 1.9773, "grad_norm": 0.4034216105937958, "learning_rate": 0.0002, "epoch": 0.11686143572621036, "step": 70}, {"loss": 1.8688, "grad_norm": 0.4602500796318054, "learning_rate": 0.0002, "epoch": 0.1335559265442404, "step": 80}, {"loss": 1.9703, "grad_norm": 0.46994853019714355, "learning_rate": 0.0002, "epoch": 0.15025041736227046, "step": 90}, {"loss": 1.8443, "grad_norm": 0.3892269730567932, "learning_rate": 0.0002, "epoch": 0.1669449081803005, "step": 100}, {"loss": 1.9192, "grad_norm": 0.40771016478538513, "learning_rate": 0.0002, "epoch": 0.18363939899833054, "step": 110}, {"loss": 1.8292, "grad_norm": 0.3820408880710602, "learning_rate": 0.0002, "epoch": 0.2003338898163606, "step": 120}, {"loss": 1.8313, "grad_norm": 0.3719843626022339, "learning_rate": 0.0002, "epoch": 0.21702838063439064, "step": 130}, {"loss": 1.8189, "grad_norm": 0.4359976351261139, "learning_rate": 0.0002, "epoch": 0.2337228714524207, "step": 140}, {"loss": 1.8125, "grad_norm": 0.3932259976863861, "learning_rate": 0.0002, "epoch": 0.25041736227045075, "step": 150}, {"loss": 1.8681, "grad_norm": 0.7001785635948181, "learning_rate": 0.0002, "epoch": 0.2671118530884808, "step": 160}, {"loss": 1.9328, "grad_norm": 0.7619664669036865, "learning_rate": 0.0002, "epoch": 0.2838063439065108, "step": 170}, {"loss": 1.7572, "grad_norm": 0.3715350329875946, "learning_rate": 0.0002, "epoch": 0.3005008347245409, "step": 180}, {"loss": 1.8551, "grad_norm": 0.5008004903793335, "learning_rate": 0.0002, "epoch": 0.31719532554257096, "step": 190}, {"loss": 1.8469, "grad_norm": 0.47509506344795227, "learning_rate": 0.0002, "epoch": 0.333889816360601, "step": 200}, {"loss": 1.9042, "grad_norm": 0.41775935888290405, "learning_rate": 0.0002, "epoch": 0.35058430717863104, "step": 210}, {"loss": 1.7995, "grad_norm": 0.43939948081970215, "learning_rate": 0.0002, "epoch": 0.3672787979966611, "step": 220}, {"loss": 1.8535, "grad_norm": 0.5101977586746216, "learning_rate": 0.0002, "epoch": 0.38397328881469117, "step": 230}, {"loss": 1.8261, "grad_norm": 0.37367475032806396, "learning_rate": 0.0002, "epoch": 0.4006677796327212, "step": 240}, {"loss": 1.8361, "grad_norm": 0.38865089416503906, "learning_rate": 0.0002, "epoch": 0.41736227045075125, "step": 250}, {"loss": 1.8044, "grad_norm": 0.33937838673591614, "learning_rate": 0.0002, "epoch": 0.4340567612687813, "step": 260}, {"loss": 1.8288, "grad_norm": 0.41416028141975403, "learning_rate": 0.0002, "epoch": 0.4507512520868113, "step": 270}, {"loss": 1.8542, "grad_norm": 0.4010271430015564, "learning_rate": 0.0002, "epoch": 0.4674457429048414, "step": 280}, {"loss": 1.8803, "grad_norm": 0.3960907459259033, "learning_rate": 0.0002, "epoch": 0.48414023372287146, "step": 290}, {"loss": 1.8875, "grad_norm": 0.357433021068573, "learning_rate": 0.0002, "epoch": 0.5008347245409015, "step": 300}, {"loss": 1.8478, "grad_norm": 0.38190674781799316, "learning_rate": 0.0002, "epoch": 0.5175292153589316, "step": 310}, {"loss": 1.8142, "grad_norm": 0.3336802124977112, "learning_rate": 0.0002, "epoch": 0.5342237061769616, "step": 320}, {"loss": 1.82, "grad_norm": 0.35935860872268677, "learning_rate": 0.0002, "epoch": 0.5509181969949917, "step": 330}, {"loss": 1.854, "grad_norm": 0.3950583040714264, "learning_rate": 0.0002, "epoch": 0.5676126878130217, "step": 340}, {"loss": 1.8089, "grad_norm": 0.31413400173187256, "learning_rate": 0.0002, "epoch": 0.5843071786310517, "step": 350}, {"loss": 1.8417, "grad_norm": 0.3342890441417694, "learning_rate": 0.0002, "epoch": 0.6010016694490818, "step": 360}, {"loss": 1.766, "grad_norm": 0.36961331963539124, "learning_rate": 0.0002, "epoch": 0.6176961602671118, "step": 370}, {"loss": 1.8264, "grad_norm": 0.350652813911438, "learning_rate": 0.0002, "epoch": 0.6343906510851419, "step": 380}, {"loss": 1.7797, "grad_norm": 0.3588177263736725, "learning_rate": 0.0002, "epoch": 0.6510851419031719, "step": 390}, {"loss": 1.6967, "grad_norm": 0.3327147960662842, "learning_rate": 0.0002, "epoch": 0.667779632721202, "step": 400}, {"loss": 1.754, "grad_norm": 0.3632844388484955, "learning_rate": 0.0002, "epoch": 0.6844741235392321, "step": 410}, {"loss": 1.8264, "grad_norm": 0.34581053256988525, "learning_rate": 0.0002, "epoch": 0.7011686143572621, "step": 420}, {"loss": 1.8706, "grad_norm": 0.37237727642059326, "learning_rate": 0.0002, "epoch": 0.7178631051752922, "step": 430}, {"loss": 1.711, "grad_norm": 0.48366475105285645, "learning_rate": 0.0002, "epoch": 0.7345575959933222, "step": 440}, {"loss": 1.8658, "grad_norm": 0.3512793183326721, "learning_rate": 0.0002, "epoch": 0.7512520868113522, "step": 450}, {"loss": 1.7687, "grad_norm": 0.30473145842552185, "learning_rate": 0.0002, "epoch": 0.7679465776293823, "step": 460}, {"loss": 1.789, "grad_norm": 0.3718157112598419, "learning_rate": 0.0002, "epoch": 0.7846410684474123, "step": 470}, {"loss": 1.8527, "grad_norm": 0.34506872296333313, "learning_rate": 0.0002, "epoch": 0.8013355592654424, "step": 480}, {"loss": 1.8163, "grad_norm": 0.36895203590393066, "learning_rate": 0.0002, "epoch": 0.8180300500834724, "step": 490}, {"loss": 1.8499, "grad_norm": 0.35659778118133545, "learning_rate": 0.0002, "epoch": 0.8347245409015025, "step": 500}, {"loss": 1.8096, "grad_norm": 0.3631179928779602, "learning_rate": 0.0002, "epoch": 0.8514190317195326, "step": 510}, {"loss": 1.8003, "grad_norm": 0.3252873420715332, "learning_rate": 0.0002, "epoch": 0.8681135225375626, "step": 520}, {"loss": 1.7831, "grad_norm": 0.32796111702919006, "learning_rate": 0.0002, "epoch": 0.8848080133555927, "step": 530}, {"loss": 1.7675, "grad_norm": 0.3556145131587982, "learning_rate": 0.0002, "epoch": 0.9015025041736227, "step": 540}, {"loss": 1.7902, "grad_norm": 0.33029764890670776, "learning_rate": 0.0002, "epoch": 0.9181969949916527, "step": 550}, {"loss": 1.7932, "grad_norm": 0.3531745970249176, "learning_rate": 0.0002, "epoch": 0.9348914858096828, "step": 560}, {"loss": 1.7982, "grad_norm": 0.3486989140510559, "learning_rate": 0.0002, "epoch": 0.9515859766277128, "step": 570}, {"loss": 1.8522, "grad_norm": 0.34676939249038696, "learning_rate": 0.0002, "epoch": 0.9682804674457429, "step": 580}, {"loss": 1.7805, "grad_norm": 0.3389652669429779, "learning_rate": 0.0002, "epoch": 0.9849749582637729, "step": 590}, {"eval_loss": 1.8236571550369263, "eval_runtime": 77.157, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.842, "epoch": 1.0, "step": 599}, {"loss": 1.7724, "grad_norm": 0.6550514698028564, "learning_rate": 0.0002, "epoch": 1.001669449081803, "step": 600}, {"loss": 1.7601, "grad_norm": 0.36824166774749756, "learning_rate": 0.0002, "epoch": 1.018363939899833, "step": 610}, {"loss": 1.7684, "grad_norm": 0.34707099199295044, "learning_rate": 0.0002, "epoch": 1.0350584307178632, "step": 620}, {"loss": 1.7552, "grad_norm": 0.38599663972854614, "learning_rate": 0.0002, "epoch": 1.0517529215358932, "step": 630}, {"loss": 1.7396, "grad_norm": 0.34381693601608276, "learning_rate": 0.0002, "epoch": 1.0684474123539232, "step": 640}, {"loss": 1.7621, "grad_norm": 0.3657481372356415, "learning_rate": 0.0002, "epoch": 1.0851419031719534, "step": 650}, {"loss": 1.723, "grad_norm": 0.3310803771018982, "learning_rate": 0.0002, "epoch": 1.1018363939899833, "step": 660}, {"loss": 1.7788, "grad_norm": 0.37122875452041626, "learning_rate": 0.0002, "epoch": 1.1185308848080133, "step": 670}, {"loss": 1.6887, "grad_norm": 0.3976633548736572, "learning_rate": 0.0002, "epoch": 1.1352253756260433, "step": 680}, {"loss": 1.7398, "grad_norm": 0.37567615509033203, "learning_rate": 0.0002, "epoch": 1.1519198664440735, "step": 690}, {"loss": 1.7326, "grad_norm": 0.3683645725250244, "learning_rate": 0.0002, "epoch": 1.1686143572621035, "step": 700}, {"loss": 1.8074, "grad_norm": 0.3862009644508362, "learning_rate": 0.0002, "epoch": 1.1853088480801335, "step": 710}, {"loss": 1.7993, "grad_norm": 0.3478439450263977, "learning_rate": 0.0002, "epoch": 1.2020033388981637, "step": 720}, {"loss": 1.7945, "grad_norm": 0.3694932162761688, "learning_rate": 0.0002, "epoch": 1.2186978297161937, "step": 730}, {"loss": 1.7269, "grad_norm": 0.3661787211894989, "learning_rate": 0.0002, "epoch": 1.2353923205342237, "step": 740}, {"loss": 1.7561, "grad_norm": 0.372951865196228, "learning_rate": 0.0002, "epoch": 1.2520868113522536, "step": 750}, {"loss": 1.7507, "grad_norm": 0.38718998432159424, "learning_rate": 0.0002, "epoch": 1.2687813021702838, "step": 760}, {"loss": 1.7331, "grad_norm": 0.37488260865211487, "learning_rate": 0.0002, "epoch": 1.2854757929883138, "step": 770}, {"loss": 1.7548, "grad_norm": 0.34794917702674866, "learning_rate": 0.0002, "epoch": 1.302170283806344, "step": 780}, {"loss": 1.7592, "grad_norm": 0.3627476990222931, "learning_rate": 0.0002, "epoch": 1.318864774624374, "step": 790}, {"loss": 1.6837, "grad_norm": 0.3773096799850464, "learning_rate": 0.0002, "epoch": 1.335559265442404, "step": 800}, {"loss": 1.7448, "grad_norm": 0.36476725339889526, "learning_rate": 0.0002, "epoch": 1.352253756260434, "step": 810}, {"loss": 1.7958, "grad_norm": 0.3767942190170288, "learning_rate": 0.0002, "epoch": 1.3689482470784642, "step": 820}, {"loss": 1.7241, "grad_norm": 0.3502795398235321, "learning_rate": 0.0002, "epoch": 1.3856427378964942, "step": 830}, {"loss": 1.736, "grad_norm": 0.4008622169494629, "learning_rate": 0.0002, "epoch": 1.4023372287145242, "step": 840}, {"loss": 1.7211, "grad_norm": 0.4029707610607147, "learning_rate": 0.0002, "epoch": 1.4190317195325544, "step": 850}, {"loss": 1.712, "grad_norm": 0.41480565071105957, "learning_rate": 0.0002, "epoch": 1.4357262103505843, "step": 860}, {"loss": 1.7287, "grad_norm": 0.4351646900177002, "learning_rate": 0.0002, "epoch": 1.4524207011686143, "step": 870}, {"loss": 1.8436, "grad_norm": 0.4053232967853546, "learning_rate": 0.0002, "epoch": 1.4691151919866443, "step": 880}, {"loss": 1.7669, "grad_norm": 0.3515186607837677, "learning_rate": 0.0002, "epoch": 1.4858096828046745, "step": 890}, {"loss": 1.698, "grad_norm": 0.42895469069480896, "learning_rate": 0.0002, "epoch": 1.5025041736227045, "step": 900}, {"loss": 1.7064, "grad_norm": 0.40897831320762634, "learning_rate": 0.0002, "epoch": 1.5191986644407347, "step": 910}, {"loss": 1.7539, "grad_norm": 0.3544739782810211, "learning_rate": 0.0002, "epoch": 1.5358931552587647, "step": 920}, {"loss": 1.7596, "grad_norm": 0.3848305642604828, "learning_rate": 0.0002, "epoch": 1.5525876460767947, "step": 930}, {"loss": 1.7344, "grad_norm": 0.36952173709869385, "learning_rate": 0.0002, "epoch": 1.5692821368948247, "step": 940}, {"loss": 1.7329, "grad_norm": 0.36505743861198425, "learning_rate": 0.0002, "epoch": 1.5859766277128546, "step": 950}, {"loss": 1.6828, "grad_norm": 0.3707764446735382, "learning_rate": 0.0002, "epoch": 1.6026711185308848, "step": 960}, {"loss": 1.7465, "grad_norm": 0.35995468497276306, "learning_rate": 0.0002, "epoch": 1.6193656093489148, "step": 970}, {"loss": 1.7537, "grad_norm": 0.35458096861839294, "learning_rate": 0.0002, "epoch": 1.636060100166945, "step": 980}, {"loss": 1.7075, "grad_norm": 0.3557756841182709, "learning_rate": 0.0002, "epoch": 1.652754590984975, "step": 990}, {"loss": 1.7307, "grad_norm": 0.355899453163147, "learning_rate": 0.0002, "epoch": 1.669449081803005, "step": 1000}, {"loss": 1.7701, "grad_norm": 0.3709148168563843, "learning_rate": 0.0002, "epoch": 1.686143572621035, "step": 1010}, {"loss": 1.7637, "grad_norm": 0.3731614947319031, "learning_rate": 0.0002, "epoch": 1.702838063439065, "step": 1020}, {"loss": 1.741, "grad_norm": 0.3639261722564697, "learning_rate": 0.0002, "epoch": 1.7195325542570952, "step": 1030}, {"loss": 1.727, "grad_norm": 0.36371079087257385, "learning_rate": 0.0002, "epoch": 1.7362270450751254, "step": 1040}, {"loss": 1.7275, "grad_norm": 0.38235539197921753, "learning_rate": 0.0002, "epoch": 1.7529215358931554, "step": 1050}, {"loss": 1.7304, "grad_norm": 0.4109364151954651, "learning_rate": 0.0002, "epoch": 1.7696160267111853, "step": 1060}, {"loss": 1.7244, "grad_norm": 0.3499647378921509, "learning_rate": 0.0002, "epoch": 1.7863105175292153, "step": 1070}, {"loss": 1.7117, "grad_norm": 0.3892260193824768, "learning_rate": 0.0002, "epoch": 1.8030050083472453, "step": 1080}, {"loss": 1.7764, "grad_norm": 0.3545094132423401, "learning_rate": 0.0002, "epoch": 1.8196994991652755, "step": 1090}, {"loss": 1.87, "grad_norm": 0.40419837832450867, "learning_rate": 0.0002, "epoch": 1.8363939899833055, "step": 1100}, {"loss": 1.6783, "grad_norm": 0.38423678278923035, "learning_rate": 0.0002, "epoch": 1.8530884808013357, "step": 1110}, {"loss": 1.7714, "grad_norm": 0.378408282995224, "learning_rate": 0.0002, "epoch": 1.8697829716193657, "step": 1120}, {"loss": 1.7461, "grad_norm": 0.4071941077709198, "learning_rate": 0.0002, "epoch": 1.8864774624373957, "step": 1130}, {"loss": 1.7281, "grad_norm": 0.42363739013671875, "learning_rate": 0.0002, "epoch": 1.9031719532554257, "step": 1140}, {"loss": 1.819, "grad_norm": 0.37373560667037964, "learning_rate": 0.0002, "epoch": 1.9198664440734556, "step": 1150}, {"loss": 1.7847, "grad_norm": 0.36408767104148865, "learning_rate": 0.0002, "epoch": 1.9365609348914858, "step": 1160}, {"loss": 1.6725, "grad_norm": 0.3795453906059265, "learning_rate": 0.0002, "epoch": 1.9532554257095158, "step": 1170}, {"loss": 1.7726, "grad_norm": 0.34415504336357117, "learning_rate": 0.0002, "epoch": 1.969949916527546, "step": 1180}, {"loss": 1.6199, "grad_norm": 0.3491021394729614, "learning_rate": 0.0002, "epoch": 1.986644407345576, "step": 1190}, {"eval_loss": 1.8182536363601685, "eval_runtime": 87.8767, "eval_samples_per_second": 5.86, "eval_steps_per_second": 0.74, "epoch": 2.0, "step": 1198}, {"loss": 1.5857, "grad_norm": 0.36758512258529663, "learning_rate": 0.0002, "epoch": 2.003338898163606, "step": 1200}, {"loss": 1.61, "grad_norm": 0.36278557777404785, "learning_rate": 0.0002, "epoch": 2.020033388981636, "step": 1210}, {"loss": 1.6733, "grad_norm": 0.4186977744102478, "learning_rate": 0.0002, "epoch": 2.036727879799666, "step": 1220}, {"loss": 1.6977, "grad_norm": 0.3958706855773926, "learning_rate": 0.0002, "epoch": 2.053422370617696, "step": 1230}, {"loss": 1.7054, "grad_norm": 0.43305638432502747, "learning_rate": 0.0002, "epoch": 2.0701168614357264, "step": 1240}, {"loss": 1.6275, "grad_norm": 0.4509678781032562, "learning_rate": 0.0002, "epoch": 2.0868113522537564, "step": 1250}, {"loss": 1.6814, "grad_norm": 0.4297264516353607, "learning_rate": 0.0002, "epoch": 2.1035058430717863, "step": 1260}, {"loss": 1.6121, "grad_norm": 0.4579504132270813, "learning_rate": 0.0002, "epoch": 2.1202003338898163, "step": 1270}, {"loss": 1.6207, "grad_norm": 0.4223267138004303, "learning_rate": 0.0002, "epoch": 2.1368948247078463, "step": 1280}, {"loss": 1.6472, "grad_norm": 0.41538703441619873, "learning_rate": 0.0002, "epoch": 2.1535893155258763, "step": 1290}, {"loss": 1.6274, "grad_norm": 0.4987374544143677, "learning_rate": 0.0002, "epoch": 2.1702838063439067, "step": 1300}, {"loss": 1.6555, "grad_norm": 0.45300114154815674, "learning_rate": 0.0002, "epoch": 2.1869782971619367, "step": 1310}, {"loss": 1.6486, "grad_norm": 0.4577588737010956, "learning_rate": 0.0002, "epoch": 2.2036727879799667, "step": 1320}, {"loss": 1.644, "grad_norm": 0.4110747277736664, "learning_rate": 0.0002, "epoch": 2.2203672787979967, "step": 1330}, {"loss": 1.5875, "grad_norm": 0.5107163190841675, "learning_rate": 0.0002, "epoch": 2.2370617696160267, "step": 1340}, {"loss": 1.691, "grad_norm": 0.41190820932388306, "learning_rate": 0.0002, "epoch": 2.2537562604340566, "step": 1350}, {"loss": 1.5745, "grad_norm": 0.47458386421203613, "learning_rate": 0.0002, "epoch": 2.2704507512520866, "step": 1360}, {"loss": 1.6964, "grad_norm": 0.42136940360069275, "learning_rate": 0.0002, "epoch": 2.287145242070117, "step": 1370}, {"loss": 1.657, "grad_norm": 0.48292383551597595, "learning_rate": 0.0002, "epoch": 2.303839732888147, "step": 1380}, {"loss": 1.6816, "grad_norm": 0.4519229531288147, "learning_rate": 0.0002, "epoch": 2.320534223706177, "step": 1390}, {"loss": 1.6408, "grad_norm": 0.5860922336578369, "learning_rate": 0.0002, "epoch": 2.337228714524207, "step": 1400}, {"loss": 1.5551, "grad_norm": 0.4362313747406006, "learning_rate": 0.0002, "epoch": 2.353923205342237, "step": 1410}, {"loss": 1.6763, "grad_norm": 0.46916621923446655, "learning_rate": 0.0002, "epoch": 2.370617696160267, "step": 1420}, {"loss": 1.6082, "grad_norm": 0.5249663591384888, "learning_rate": 0.0002, "epoch": 2.3873121869782974, "step": 1430}, {"loss": 1.6793, "grad_norm": 0.4764375388622284, "learning_rate": 0.0002, "epoch": 2.4040066777963274, "step": 1440}, {"loss": 1.6395, "grad_norm": 0.46573784947395325, "learning_rate": 0.0002, "epoch": 2.4207011686143574, "step": 1450}, {"loss": 1.6629, "grad_norm": 0.44539371132850647, "learning_rate": 0.0002, "epoch": 2.4373956594323873, "step": 1460}, {"loss": 1.6149, "grad_norm": 0.40925896167755127, "learning_rate": 0.0002, "epoch": 2.4540901502504173, "step": 1470}, {"loss": 1.6213, "grad_norm": 0.4431462287902832, "learning_rate": 0.0002, "epoch": 2.4707846410684473, "step": 1480}, {"loss": 1.692, "grad_norm": 0.5476022362709045, "learning_rate": 0.0002, "epoch": 2.4874791318864773, "step": 1490}, {"loss": 1.5875, "grad_norm": 0.44762539863586426, "learning_rate": 0.0002, "epoch": 2.5041736227045073, "step": 1500}, {"loss": 1.6811, "grad_norm": 0.5470041632652283, "learning_rate": 0.0002, "epoch": 2.5208681135225377, "step": 1510}, {"loss": 1.7411, "grad_norm": 0.4739997088909149, "learning_rate": 0.0002, "epoch": 2.5375626043405677, "step": 1520}, {"loss": 1.5975, "grad_norm": 0.47115322947502136, "learning_rate": 0.0002, "epoch": 2.5542570951585977, "step": 1530}, {"loss": 1.6828, "grad_norm": 0.49705708026885986, "learning_rate": 0.0002, "epoch": 2.5709515859766277, "step": 1540}, {"loss": 1.7135, "grad_norm": 0.5537301301956177, "learning_rate": 0.0002, "epoch": 2.5876460767946576, "step": 1550}, {"loss": 1.5753, "grad_norm": 0.46930626034736633, "learning_rate": 0.0002, "epoch": 2.604340567612688, "step": 1560}, {"loss": 1.6706, "grad_norm": 0.42371469736099243, "learning_rate": 0.0002, "epoch": 2.621035058430718, "step": 1570}, {"loss": 1.6193, "grad_norm": 0.49005603790283203, "learning_rate": 0.0002, "epoch": 2.637729549248748, "step": 1580}, {"loss": 1.7138, "grad_norm": 0.4646829068660736, "learning_rate": 0.0002, "epoch": 2.654424040066778, "step": 1590}, {"loss": 1.6806, "grad_norm": 0.5091238617897034, "learning_rate": 0.0002, "epoch": 2.671118530884808, "step": 1600}, {"loss": 1.65, "grad_norm": 0.4889985918998718, "learning_rate": 0.0002, "epoch": 2.687813021702838, "step": 1610}, {"loss": 1.6573, "grad_norm": 0.5128234624862671, "learning_rate": 0.0002, "epoch": 2.704507512520868, "step": 1620}, {"loss": 1.6739, "grad_norm": 0.46999186277389526, "learning_rate": 0.0002, "epoch": 2.721202003338898, "step": 1630}, {"loss": 1.6277, "grad_norm": 0.4949921667575836, "learning_rate": 0.0002, "epoch": 2.7378964941569284, "step": 1640}, {"loss": 1.5958, "grad_norm": 0.4484370946884155, "learning_rate": 0.0002, "epoch": 2.7545909849749584, "step": 1650}, {"loss": 1.6156, "grad_norm": 0.45599570870399475, "learning_rate": 0.0002, "epoch": 2.7712854757929883, "step": 1660}, {"loss": 1.6037, "grad_norm": 0.5093285441398621, "learning_rate": 0.0002, "epoch": 2.7879799666110183, "step": 1670}, {"loss": 1.6675, "grad_norm": 0.44737935066223145, "learning_rate": 0.0002, "epoch": 2.8046744574290483, "step": 1680}, {"loss": 1.6429, "grad_norm": 0.4374251365661621, "learning_rate": 0.0002, "epoch": 2.8213689482470787, "step": 1690}, {"loss": 1.5941, "grad_norm": 0.44765740633010864, "learning_rate": 0.0002, "epoch": 2.8380634390651087, "step": 1700}, {"loss": 1.6841, "grad_norm": 0.44685253500938416, "learning_rate": 0.0002, "epoch": 2.8547579298831387, "step": 1710}, {"loss": 1.6301, "grad_norm": 0.44777143001556396, "learning_rate": 0.0002, "epoch": 2.8714524207011687, "step": 1720}, {"loss": 1.6473, "grad_norm": 0.4178132712841034, "learning_rate": 0.0002, "epoch": 2.8881469115191987, "step": 1730}, {"loss": 1.6491, "grad_norm": 0.4487852156162262, "learning_rate": 0.0002, "epoch": 2.9048414023372287, "step": 1740}, {"loss": 1.6616, "grad_norm": 0.47137337923049927, "learning_rate": 0.0002, "epoch": 2.9215358931552586, "step": 1750}, {"loss": 1.6767, "grad_norm": 0.48543235659599304, "learning_rate": 0.0002, "epoch": 2.9382303839732886, "step": 1760}, {"loss": 1.7055, "grad_norm": 0.4174182116985321, "learning_rate": 0.0002, "epoch": 2.9549248747913186, "step": 1770}, {"loss": 1.5767, "grad_norm": 0.43385711312294006, "learning_rate": 0.0002, "epoch": 2.971619365609349, "step": 1780}, {"loss": 1.6215, "grad_norm": 0.474332332611084, "learning_rate": 0.0002, "epoch": 2.988313856427379, "step": 1790}, {"eval_loss": 1.8456445932388306, "eval_runtime": 87.6261, "eval_samples_per_second": 5.877, "eval_steps_per_second": 0.742, "epoch": 3.0, "step": 1797}, {"loss": 1.6224, "grad_norm": 0.40323764085769653, "learning_rate": 0.0002, "epoch": 3.005008347245409, "step": 1800}, {"loss": 1.5367, "grad_norm": 0.45069044828414917, "learning_rate": 0.0002, "epoch": 3.021702838063439, "step": 1810}, {"loss": 1.5271, "grad_norm": 0.6204925775527954, "learning_rate": 0.0002, "epoch": 3.038397328881469, "step": 1820}, {"loss": 1.5056, "grad_norm": 0.5857783555984497, "learning_rate": 0.0002, "epoch": 3.0550918196994994, "step": 1830}, {"loss": 1.5137, "grad_norm": 0.6776524782180786, "learning_rate": 0.0002, "epoch": 3.0717863105175294, "step": 1840}, {"loss": 1.5106, "grad_norm": 0.5486199855804443, "learning_rate": 0.0002, "epoch": 3.0884808013355594, "step": 1850}, {"loss": 1.414, "grad_norm": 0.5496503710746765, "learning_rate": 0.0002, "epoch": 3.1051752921535893, "step": 1860}, {"loss": 1.5181, "grad_norm": 0.5602648258209229, "learning_rate": 0.0002, "epoch": 3.1218697829716193, "step": 1870}, {"loss": 1.5406, "grad_norm": 1.0697380304336548, "learning_rate": 0.0002, "epoch": 3.1385642737896493, "step": 1880}, {"loss": 1.4889, "grad_norm": 0.6087332367897034, "learning_rate": 0.0002, "epoch": 3.1552587646076793, "step": 1890}, {"loss": 1.5219, "grad_norm": 0.5112161040306091, "learning_rate": 0.0002, "epoch": 3.1719532554257097, "step": 1900}, {"loss": 1.5139, "grad_norm": 0.6393680572509766, "learning_rate": 0.0002, "epoch": 3.1886477462437397, "step": 1910}, {"loss": 1.5337, "grad_norm": 0.7201815247535706, "learning_rate": 0.0002, "epoch": 3.2053422370617697, "step": 1920}, {"loss": 1.6055, "grad_norm": 0.5856018662452698, "learning_rate": 0.0002, "epoch": 3.2220367278797997, "step": 1930}, {"loss": 1.4791, "grad_norm": 0.581247866153717, "learning_rate": 0.0002, "epoch": 3.2387312186978297, "step": 1940}, {"loss": 1.5395, "grad_norm": 0.6055102944374084, "learning_rate": 0.0002, "epoch": 3.2554257095158596, "step": 1950}, {"loss": 1.5086, "grad_norm": 0.546894371509552, "learning_rate": 0.0002, "epoch": 3.27212020033389, "step": 1960}, {"loss": 1.5712, "grad_norm": 0.565558910369873, "learning_rate": 0.0002, "epoch": 3.28881469115192, "step": 1970}, {"loss": 1.47, "grad_norm": 1.2238883972167969, "learning_rate": 0.0002, "epoch": 3.30550918196995, "step": 1980}, {"loss": 1.4655, "grad_norm": 0.6362585425376892, "learning_rate": 0.0002, "epoch": 3.32220367278798, "step": 1990}, {"loss": 1.5157, "grad_norm": 0.6131124496459961, "learning_rate": 0.0002, "epoch": 3.33889816360601, "step": 2000}, {"loss": 1.5322, "grad_norm": 0.5181341767311096, "learning_rate": 0.0002, "epoch": 3.35559265442404, "step": 2010}, {"loss": 1.5039, "grad_norm": 0.6667609810829163, "learning_rate": 0.0002, "epoch": 3.37228714524207, "step": 2020}, {"loss": 1.5814, "grad_norm": 0.6488749980926514, "learning_rate": 0.0002, "epoch": 3.3889816360601, "step": 2030}, {"loss": 1.5226, "grad_norm": 0.5693286061286926, "learning_rate": 0.0002, "epoch": 3.4056761268781304, "step": 2040}, {"loss": 1.5121, "grad_norm": 0.6154143810272217, "learning_rate": 0.0002, "epoch": 3.4223706176961604, "step": 2050}, {"loss": 1.6033, "grad_norm": 0.6747981309890747, "learning_rate": 0.0002, "epoch": 3.4390651085141903, "step": 2060}, {"loss": 1.5857, "grad_norm": 0.5494789481163025, "learning_rate": 0.0002, "epoch": 3.4557595993322203, "step": 2070}, {"loss": 1.5223, "grad_norm": 2.481968402862549, "learning_rate": 0.0002, "epoch": 3.4724540901502503, "step": 2080}, {"loss": 1.4989, "grad_norm": 0.589784562587738, "learning_rate": 0.0002, "epoch": 3.4891485809682803, "step": 2090}, {"loss": 1.6227, "grad_norm": 0.6449820399284363, "learning_rate": 0.0002, "epoch": 3.5058430717863107, "step": 2100}, {"loss": 1.588, "grad_norm": 0.6467038989067078, "learning_rate": 0.0002, "epoch": 3.5225375626043407, "step": 2110}, {"loss": 1.5655, "grad_norm": 0.6533533334732056, "learning_rate": 0.0002, "epoch": 3.5392320534223707, "step": 2120}, {"loss": 1.6052, "grad_norm": 0.6804035902023315, "learning_rate": 0.0002, "epoch": 3.5559265442404007, "step": 2130}, {"loss": 1.5408, "grad_norm": 0.628773033618927, "learning_rate": 0.0002, "epoch": 3.5726210350584306, "step": 2140}, {"loss": 1.5487, "grad_norm": 0.6055739521980286, "learning_rate": 0.0002, "epoch": 3.5893155258764606, "step": 2150}, {"loss": 1.5305, "grad_norm": 0.6000894904136658, "learning_rate": 0.0002, "epoch": 3.6060100166944906, "step": 2160}, {"loss": 1.4742, "grad_norm": 0.5862473249435425, "learning_rate": 0.0002, "epoch": 3.6227045075125206, "step": 2170}, {"loss": 1.503, "grad_norm": 0.6547419428825378, "learning_rate": 0.0002, "epoch": 3.639398998330551, "step": 2180}, {"loss": 1.4704, "grad_norm": 0.5610318779945374, "learning_rate": 0.0002, "epoch": 3.656093489148581, "step": 2190}, {"loss": 1.4814, "grad_norm": 0.6387564539909363, "learning_rate": 0.0002, "epoch": 3.672787979966611, "step": 2200}, {"loss": 1.5356, "grad_norm": 0.6065090894699097, "learning_rate": 0.0002, "epoch": 3.689482470784641, "step": 2210}, {"loss": 1.5074, "grad_norm": 0.6266646981239319, "learning_rate": 0.0002, "epoch": 3.706176961602671, "step": 2220}, {"loss": 1.5146, "grad_norm": 0.626944363117218, "learning_rate": 0.0002, "epoch": 3.7228714524207014, "step": 2230}, {"loss": 1.5131, "grad_norm": 0.6043975949287415, "learning_rate": 0.0002, "epoch": 3.7395659432387314, "step": 2240}, {"loss": 1.5929, "grad_norm": 0.599732518196106, "learning_rate": 0.0002, "epoch": 3.7562604340567614, "step": 2250}, {"loss": 1.5236, "grad_norm": 0.6738389134407043, "learning_rate": 0.0002, "epoch": 3.7729549248747913, "step": 2260}, {"loss": 1.5003, "grad_norm": 0.5561335682868958, "learning_rate": 0.0002, "epoch": 3.7896494156928213, "step": 2270}, {"loss": 1.5013, "grad_norm": 0.6185726523399353, "learning_rate": 0.0002, "epoch": 3.8063439065108513, "step": 2280}, {"loss": 1.4996, "grad_norm": 0.6151532530784607, "learning_rate": 0.0002, "epoch": 3.8230383973288813, "step": 2290}, {"loss": 1.5453, "grad_norm": 0.5808233022689819, "learning_rate": 0.0002, "epoch": 3.8397328881469113, "step": 2300}, {"loss": 1.5223, "grad_norm": 0.6615163683891296, "learning_rate": 0.0002, "epoch": 3.8564273789649417, "step": 2310}, {"loss": 1.4365, "grad_norm": 0.5832979679107666, "learning_rate": 0.0002, "epoch": 3.8731218697829717, "step": 2320}, {"loss": 1.6036, "grad_norm": 0.6119300127029419, "learning_rate": 0.0002, "epoch": 3.8898163606010017, "step": 2330}, {"loss": 1.5581, "grad_norm": 0.6489697694778442, "learning_rate": 0.0002, "epoch": 3.9065108514190316, "step": 2340}, {"loss": 1.5601, "grad_norm": 0.5539063215255737, "learning_rate": 0.0002, "epoch": 3.9232053422370616, "step": 2350}, {"loss": 1.5174, "grad_norm": 0.6062877178192139, "learning_rate": 0.0002, "epoch": 3.939899833055092, "step": 2360}, {"loss": 1.5168, "grad_norm": 0.680609941482544, "learning_rate": 0.0002, "epoch": 3.956594323873122, "step": 2370}, {"loss": 1.4875, "grad_norm": 0.6176834106445312, "learning_rate": 0.0002, "epoch": 3.973288814691152, "step": 2380}, {"loss": 1.4984, "grad_norm": 0.6538102030754089, "learning_rate": 0.0002, "epoch": 3.989983305509182, "step": 2390}, {"eval_loss": 1.8920671939849854, "eval_runtime": 76.5227, "eval_samples_per_second": 6.73, "eval_steps_per_second": 0.849, "epoch": 4.0, "step": 2396}, {"loss": 1.3926, "grad_norm": 0.5683762431144714, "learning_rate": 0.0002, "epoch": 4.006677796327212, "step": 2400}, {"loss": 1.3387, "grad_norm": 0.6858044862747192, "learning_rate": 0.0002, "epoch": 4.023372287145242, "step": 2410}, {"loss": 1.4495, "grad_norm": 0.7614858150482178, "learning_rate": 0.0002, "epoch": 4.040066777963272, "step": 2420}, {"loss": 1.2696, "grad_norm": 0.709412693977356, "learning_rate": 0.0002, "epoch": 4.056761268781302, "step": 2430}, {"loss": 1.3836, "grad_norm": 0.7070785760879517, "learning_rate": 0.0002, "epoch": 4.073455759599332, "step": 2440}, {"loss": 1.3527, "grad_norm": 0.8815216422080994, "learning_rate": 0.0002, "epoch": 4.090150250417362, "step": 2450}, {"loss": 1.3731, "grad_norm": 0.759981632232666, "learning_rate": 0.0002, "epoch": 4.106844741235392, "step": 2460}, {"loss": 1.3393, "grad_norm": 0.6715240478515625, "learning_rate": 0.0002, "epoch": 4.123539232053423, "step": 2470}, {"loss": 1.3934, "grad_norm": 0.7503564953804016, "learning_rate": 0.0002, "epoch": 4.140233722871453, "step": 2480}, {"loss": 1.324, "grad_norm": 0.773743748664856, "learning_rate": 0.0002, "epoch": 4.156928213689483, "step": 2490}, {"loss": 1.3782, "grad_norm": 0.8850100040435791, "learning_rate": 0.0002, "epoch": 4.173622704507513, "step": 2500}, {"loss": 1.3183, "grad_norm": 0.7575962543487549, "learning_rate": 0.0002, "epoch": 4.190317195325543, "step": 2510}, {"loss": 1.3673, "grad_norm": 0.9117498397827148, "learning_rate": 0.0002, "epoch": 4.207011686143573, "step": 2520}, {"loss": 1.3242, "grad_norm": 0.7637559175491333, "learning_rate": 0.0002, "epoch": 4.223706176961603, "step": 2530}, {"loss": 1.3764, "grad_norm": 0.8178390264511108, "learning_rate": 0.0002, "epoch": 4.240400667779633, "step": 2540}, {"loss": 1.3808, "grad_norm": 0.8299263119697571, "learning_rate": 0.0002, "epoch": 4.257095158597663, "step": 2550}, {"loss": 1.3637, "grad_norm": 0.7238091230392456, "learning_rate": 0.0002, "epoch": 4.273789649415693, "step": 2560}, {"loss": 1.349, "grad_norm": 0.7468036413192749, "learning_rate": 0.0002, "epoch": 4.290484140233723, "step": 2570}, {"loss": 1.4422, "grad_norm": 0.8012791275978088, "learning_rate": 0.0002, "epoch": 4.307178631051753, "step": 2580}, {"loss": 1.3723, "grad_norm": 0.8302484154701233, "learning_rate": 0.0002, "epoch": 4.323873121869783, "step": 2590}, {"loss": 1.4013, "grad_norm": 0.751864492893219, "learning_rate": 0.0002, "epoch": 4.340567612687813, "step": 2600}, {"loss": 1.3881, "grad_norm": 0.8025410175323486, "learning_rate": 0.0002, "epoch": 4.357262103505843, "step": 2610}, {"loss": 1.3831, "grad_norm": 0.7700799703598022, "learning_rate": 0.0002, "epoch": 4.373956594323873, "step": 2620}, {"loss": 1.3721, "grad_norm": 0.8526890873908997, "learning_rate": 0.0002, "epoch": 4.390651085141903, "step": 2630}, {"loss": 1.4253, "grad_norm": 1.0536625385284424, "learning_rate": 0.0002, "epoch": 4.407345575959933, "step": 2640}, {"loss": 1.3736, "grad_norm": 0.7223818898200989, "learning_rate": 0.0002, "epoch": 4.424040066777963, "step": 2650}, {"loss": 1.4652, "grad_norm": 0.7981253266334534, "learning_rate": 0.0002, "epoch": 4.440734557595993, "step": 2660}, {"loss": 1.3878, "grad_norm": 0.7136162519454956, "learning_rate": 0.0002, "epoch": 4.457429048414023, "step": 2670}, {"loss": 1.4242, "grad_norm": 0.8008312582969666, "learning_rate": 0.0002, "epoch": 4.474123539232053, "step": 2680}, {"loss": 1.3448, "grad_norm": 0.7924065589904785, "learning_rate": 0.0002, "epoch": 4.490818030050083, "step": 2690}, {"loss": 1.402, "grad_norm": 0.8224287629127502, "learning_rate": 0.0002, "epoch": 4.507512520868113, "step": 2700}, {"loss": 1.2841, "grad_norm": 0.7494375109672546, "learning_rate": 0.0002, "epoch": 4.524207011686143, "step": 2710}, {"loss": 1.4471, "grad_norm": 0.8097899556159973, "learning_rate": 0.0002, "epoch": 4.540901502504173, "step": 2720}, {"loss": 1.4116, "grad_norm": 0.7728819251060486, "learning_rate": 0.0002, "epoch": 4.557595993322204, "step": 2730}, {"loss": 1.3549, "grad_norm": 0.9112362265586853, "learning_rate": 0.0002, "epoch": 4.574290484140234, "step": 2740}, {"loss": 1.4601, "grad_norm": 0.7502672076225281, "learning_rate": 0.0002, "epoch": 4.590984974958264, "step": 2750}, {"loss": 1.4216, "grad_norm": 0.8816406726837158, "learning_rate": 0.0002, "epoch": 4.607679465776294, "step": 2760}, {"loss": 1.3233, "grad_norm": 0.7117180228233337, "learning_rate": 0.0002, "epoch": 4.624373956594324, "step": 2770}, {"loss": 1.3886, "grad_norm": 0.8224529027938843, "learning_rate": 0.0002, "epoch": 4.641068447412354, "step": 2780}, {"loss": 1.3756, "grad_norm": 0.7625266313552856, "learning_rate": 0.0002, "epoch": 4.657762938230384, "step": 2790}, {"loss": 1.3953, "grad_norm": 0.7754318118095398, "learning_rate": 0.0002, "epoch": 4.674457429048414, "step": 2800}, {"loss": 1.4102, "grad_norm": 0.7907336354255676, "learning_rate": 0.0002, "epoch": 4.691151919866444, "step": 2810}, {"loss": 1.3277, "grad_norm": 0.7377734780311584, "learning_rate": 0.0002, "epoch": 4.707846410684474, "step": 2820}, {"loss": 1.3686, "grad_norm": 0.7380456328392029, "learning_rate": 0.0002, "epoch": 4.724540901502504, "step": 2830}, {"loss": 1.4405, "grad_norm": 0.7148023247718811, "learning_rate": 0.0002, "epoch": 4.741235392320534, "step": 2840}, {"loss": 1.4025, "grad_norm": 0.807048499584198, "learning_rate": 0.0002, "epoch": 4.757929883138564, "step": 2850}, {"loss": 1.3195, "grad_norm": 0.8444154858589172, "learning_rate": 0.0002, "epoch": 4.774624373956595, "step": 2860}, {"loss": 1.4282, "grad_norm": 0.8328704237937927, "learning_rate": 0.0002, "epoch": 4.791318864774624, "step": 2870}, {"loss": 1.413, "grad_norm": 0.89827960729599, "learning_rate": 0.0002, "epoch": 4.808013355592655, "step": 2880}, {"loss": 1.4488, "grad_norm": 0.7848225831985474, "learning_rate": 0.0002, "epoch": 4.824707846410685, "step": 2890}, {"loss": 1.3757, "grad_norm": 0.703802227973938, "learning_rate": 0.0002, "epoch": 4.841402337228715, "step": 2900}, {"loss": 1.4404, "grad_norm": 0.8092581629753113, "learning_rate": 0.0002, "epoch": 4.858096828046745, "step": 2910}, {"loss": 1.3812, "grad_norm": 0.7537722587585449, "learning_rate": 0.0002, "epoch": 4.874791318864775, "step": 2920}, {"loss": 1.4499, "grad_norm": 0.7966470122337341, "learning_rate": 0.0002, "epoch": 4.891485809682805, "step": 2930}, {"loss": 1.3922, "grad_norm": 0.7860329747200012, "learning_rate": 0.0002, "epoch": 4.908180300500835, "step": 2940}, {"loss": 1.4224, "grad_norm": 0.7964439988136292, "learning_rate": 0.0002, "epoch": 4.924874791318865, "step": 2950}, {"loss": 1.3869, "grad_norm": 0.740288257598877, "learning_rate": 0.0002, "epoch": 4.941569282136895, "step": 2960}, {"loss": 1.4321, "grad_norm": 0.7377685904502869, "learning_rate": 0.0002, "epoch": 4.958263772954925, "step": 2970}, {"loss": 1.4253, "grad_norm": 0.793484628200531, "learning_rate": 0.0002, "epoch": 4.974958263772955, "step": 2980}, {"loss": 1.3966, "grad_norm": 0.7710573077201843, "learning_rate": 0.0002, "epoch": 4.9916527545909855, "step": 2990}, {"eval_loss": 1.9764225482940674, "eval_runtime": 87.968, "eval_samples_per_second": 5.854, "eval_steps_per_second": 0.739, "epoch": 5.0, "step": 2995}, {"loss": 1.3493, "grad_norm": 0.680841326713562, "learning_rate": 0.0002, "epoch": 5.008347245409015, "step": 3000}, {"loss": 1.2462, "grad_norm": 0.8790825009346008, "learning_rate": 0.0002, "epoch": 5.025041736227045, "step": 3010}, {"loss": 1.2514, "grad_norm": 1.1519404649734497, "learning_rate": 0.0002, "epoch": 5.041736227045075, "step": 3020}, {"loss": 1.224, "grad_norm": 1.1939337253570557, "learning_rate": 0.0002, "epoch": 5.058430717863105, "step": 3030}, {"loss": 1.1274, "grad_norm": 1.1471049785614014, "learning_rate": 0.0002, "epoch": 5.075125208681135, "step": 3040}, {"loss": 1.1726, "grad_norm": 1.0808285474777222, "learning_rate": 0.0002, "epoch": 5.091819699499165, "step": 3050}, {"loss": 1.1644, "grad_norm": 1.0102492570877075, "learning_rate": 0.0002, "epoch": 5.108514190317195, "step": 3060}, {"loss": 1.1652, "grad_norm": 0.9869397282600403, "learning_rate": 0.0002, "epoch": 5.125208681135225, "step": 3070}, {"loss": 1.1997, "grad_norm": 0.9689525365829468, "learning_rate": 0.0002, "epoch": 5.141903171953255, "step": 3080}, {"loss": 1.1747, "grad_norm": 0.9293769598007202, "learning_rate": 0.0002, "epoch": 5.158597662771285, "step": 3090}, {"loss": 1.1728, "grad_norm": 0.9289103150367737, "learning_rate": 0.0002, "epoch": 5.175292153589315, "step": 3100}, {"loss": 1.2538, "grad_norm": 0.9736173152923584, "learning_rate": 0.0002, "epoch": 5.191986644407345, "step": 3110}, {"loss": 1.2429, "grad_norm": 1.3144289255142212, "learning_rate": 0.0002, "epoch": 5.208681135225375, "step": 3120}, {"loss": 1.2107, "grad_norm": 0.95982825756073, "learning_rate": 0.0002, "epoch": 5.225375626043405, "step": 3130}, {"loss": 1.2239, "grad_norm": 0.903189480304718, "learning_rate": 0.0002, "epoch": 5.242070116861436, "step": 3140}, {"loss": 1.2663, "grad_norm": 1.056692123413086, "learning_rate": 0.0002, "epoch": 5.258764607679466, "step": 3150}, {"loss": 1.2955, "grad_norm": 1.1169359683990479, "learning_rate": 0.0002, "epoch": 5.275459098497496, "step": 3160}, {"loss": 1.1559, "grad_norm": 1.2178374528884888, "learning_rate": 0.0002, "epoch": 5.292153589315526, "step": 3170}, {"loss": 1.2394, "grad_norm": 0.9956373572349548, "learning_rate": 0.0002, "epoch": 5.308848080133556, "step": 3180}, {"loss": 1.1792, "grad_norm": 0.959555447101593, "learning_rate": 0.0002, "epoch": 5.325542570951586, "step": 3190}, {"loss": 1.1817, "grad_norm": 0.9343846440315247, "learning_rate": 0.0002, "epoch": 5.342237061769616, "step": 3200}, {"loss": 1.2033, "grad_norm": 0.8806524872779846, "learning_rate": 0.0002, "epoch": 5.358931552587646, "step": 3210}, {"loss": 1.2511, "grad_norm": 0.9477803111076355, "learning_rate": 0.0002, "epoch": 5.375626043405676, "step": 3220}, {"loss": 1.2011, "grad_norm": 0.9975674152374268, "learning_rate": 0.0002, "epoch": 5.392320534223706, "step": 3230}, {"loss": 1.3012, "grad_norm": 0.9650071263313293, "learning_rate": 0.0002, "epoch": 5.409015025041736, "step": 3240}, {"loss": 1.2281, "grad_norm": 1.0170838832855225, "learning_rate": 0.0002, "epoch": 5.425709515859766, "step": 3250}, {"loss": 1.2635, "grad_norm": 1.158118486404419, "learning_rate": 0.0002, "epoch": 5.442404006677796, "step": 3260}, {"loss": 1.3333, "grad_norm": 1.0228497982025146, "learning_rate": 0.0002, "epoch": 5.459098497495827, "step": 3270}, {"loss": 1.1961, "grad_norm": 1.0101768970489502, "learning_rate": 0.0002, "epoch": 5.475792988313857, "step": 3280}, {"loss": 1.3058, "grad_norm": 1.0407295227050781, "learning_rate": 0.0002, "epoch": 5.492487479131887, "step": 3290}, {"loss": 1.2062, "grad_norm": 0.9337932467460632, "learning_rate": 0.0002, "epoch": 5.509181969949917, "step": 3300}, {"loss": 1.2241, "grad_norm": 1.0305527448654175, "learning_rate": 0.0002, "epoch": 5.525876460767947, "step": 3310}, {"loss": 1.2524, "grad_norm": 1.0523453950881958, "learning_rate": 0.0002, "epoch": 5.542570951585977, "step": 3320}, {"loss": 1.2526, "grad_norm": 0.9707391858100891, "learning_rate": 0.0002, "epoch": 5.559265442404007, "step": 3330}, {"loss": 1.3002, "grad_norm": 1.0054972171783447, "learning_rate": 0.0002, "epoch": 5.575959933222037, "step": 3340}, {"loss": 1.2459, "grad_norm": 1.0393340587615967, "learning_rate": 0.0002, "epoch": 5.592654424040067, "step": 3350}, {"loss": 1.2328, "grad_norm": 1.0671277046203613, "learning_rate": 0.0002, "epoch": 5.609348914858097, "step": 3360}, {"loss": 1.2415, "grad_norm": 1.0725873708724976, "learning_rate": 0.0002, "epoch": 5.626043405676127, "step": 3370}, {"loss": 1.2475, "grad_norm": 0.9844746589660645, "learning_rate": 0.0002, "epoch": 5.642737896494157, "step": 3380}, {"loss": 1.1997, "grad_norm": 0.9659736752510071, "learning_rate": 0.0002, "epoch": 5.659432387312187, "step": 3390}, {"loss": 1.2426, "grad_norm": 0.9152608513832092, "learning_rate": 0.0002, "epoch": 5.676126878130217, "step": 3400}, {"loss": 1.2424, "grad_norm": 0.9759509563446045, "learning_rate": 0.0002, "epoch": 5.692821368948247, "step": 3410}, {"loss": 1.2264, "grad_norm": 1.0662057399749756, "learning_rate": 0.0002, "epoch": 5.709515859766277, "step": 3420}, {"loss": 1.19, "grad_norm": 0.9780185222625732, "learning_rate": 0.0002, "epoch": 5.726210350584307, "step": 3430}, {"loss": 1.2603, "grad_norm": 0.9781617522239685, "learning_rate": 0.0002, "epoch": 5.742904841402337, "step": 3440}, {"loss": 1.2472, "grad_norm": 1.0790785551071167, "learning_rate": 0.0002, "epoch": 5.759599332220367, "step": 3450}, {"loss": 1.2697, "grad_norm": 1.0573410987854004, "learning_rate": 0.0002, "epoch": 5.776293823038397, "step": 3460}, {"loss": 1.2591, "grad_norm": 0.9953364729881287, "learning_rate": 0.0002, "epoch": 5.792988313856427, "step": 3470}, {"loss": 1.2361, "grad_norm": 1.0072667598724365, "learning_rate": 0.0002, "epoch": 5.809682804674457, "step": 3480}, {"loss": 1.286, "grad_norm": 0.9312750697135925, "learning_rate": 0.0002, "epoch": 5.826377295492487, "step": 3490}, {"loss": 1.2379, "grad_norm": 1.059614896774292, "learning_rate": 0.0002, "epoch": 5.843071786310517, "step": 3500}, {"loss": 1.2323, "grad_norm": 1.2089484930038452, "learning_rate": 0.0002, "epoch": 5.859766277128547, "step": 3510}, {"loss": 1.2047, "grad_norm": 1.0740607976913452, "learning_rate": 0.0002, "epoch": 5.876460767946577, "step": 3520}, {"loss": 1.2809, "grad_norm": 0.9620149731636047, "learning_rate": 0.0002, "epoch": 5.893155258764608, "step": 3530}, {"loss": 1.238, "grad_norm": 1.0482431650161743, "learning_rate": 0.0002, "epoch": 5.909849749582638, "step": 3540}, {"loss": 1.2621, "grad_norm": 0.9137503504753113, "learning_rate": 0.0002, "epoch": 5.926544240400668, "step": 3550}, {"loss": 1.3066, "grad_norm": 1.1599403619766235, "learning_rate": 0.0002, "epoch": 5.943238731218698, "step": 3560}, {"loss": 1.2556, "grad_norm": 0.911613404750824, "learning_rate": 0.0002, "epoch": 5.959933222036728, "step": 3570}, {"loss": 1.2746, "grad_norm": 0.9120033383369446, "learning_rate": 0.0002, "epoch": 5.976627712854758, "step": 3580}, {"loss": 1.2815, "grad_norm": 1.0588736534118652, "learning_rate": 0.0002, "epoch": 5.993322203672788, "step": 3590}, {"eval_loss": 2.0921614170074463, "eval_runtime": 71.974, "eval_samples_per_second": 7.155, "eval_steps_per_second": 0.903, "epoch": 6.0, "step": 3594}, {"loss": 1.1397, "grad_norm": 0.9213348627090454, "learning_rate": 0.0002, "epoch": 6.010016694490818, "step": 3600}, {"loss": 1.07, "grad_norm": 1.137640357017517, "learning_rate": 0.0002, "epoch": 6.026711185308848, "step": 3610}, {"loss": 0.9953, "grad_norm": 1.200276494026184, "learning_rate": 0.0002, "epoch": 6.043405676126878, "step": 3620}, {"loss": 1.0356, "grad_norm": 1.335649013519287, "learning_rate": 0.0002, "epoch": 6.060100166944908, "step": 3630}, {"loss": 1.1154, "grad_norm": 1.1353906393051147, "learning_rate": 0.0002, "epoch": 6.076794657762938, "step": 3640}, {"loss": 1.0481, "grad_norm": 1.0406795740127563, "learning_rate": 0.0002, "epoch": 6.093489148580968, "step": 3650}, {"loss": 1.0594, "grad_norm": 1.2691017389297485, "learning_rate": 0.0002, "epoch": 6.110183639398999, "step": 3660}, {"loss": 1.0594, "grad_norm": 1.3334898948669434, "learning_rate": 0.0002, "epoch": 6.126878130217029, "step": 3670}, {"loss": 1.0186, "grad_norm": 1.1766020059585571, "learning_rate": 0.0002, "epoch": 6.143572621035059, "step": 3680}, {"loss": 1.0431, "grad_norm": 1.1079157590866089, "learning_rate": 0.0002, "epoch": 6.160267111853089, "step": 3690}, {"loss": 1.0395, "grad_norm": 1.4312299489974976, "learning_rate": 0.0002, "epoch": 6.176961602671119, "step": 3700}, {"loss": 1.1095, "grad_norm": 1.2636224031448364, "learning_rate": 0.0002, "epoch": 6.193656093489149, "step": 3710}, {"loss": 1.0669, "grad_norm": 1.1957253217697144, "learning_rate": 0.0002, "epoch": 6.210350584307179, "step": 3720}, {"loss": 1.0199, "grad_norm": 1.1044131517410278, "learning_rate": 0.0002, "epoch": 6.227045075125209, "step": 3730}, {"loss": 1.0316, "grad_norm": 1.2045193910598755, "learning_rate": 0.0002, "epoch": 6.243739565943239, "step": 3740}, {"loss": 1.1218, "grad_norm": 1.0740957260131836, "learning_rate": 0.0002, "epoch": 6.260434056761269, "step": 3750}, {"loss": 1.0271, "grad_norm": 1.1548833847045898, "learning_rate": 0.0002, "epoch": 6.277128547579299, "step": 3760}, {"loss": 1.14, "grad_norm": 1.257440209388733, "learning_rate": 0.0002, "epoch": 6.293823038397329, "step": 3770}, {"loss": 1.0762, "grad_norm": 1.1988940238952637, "learning_rate": 0.0002, "epoch": 6.310517529215359, "step": 3780}, {"loss": 1.0627, "grad_norm": 1.1707229614257812, "learning_rate": 0.0002, "epoch": 6.3272120200333895, "step": 3790}, {"loss": 1.053, "grad_norm": 1.360107660293579, "learning_rate": 0.0002, "epoch": 6.343906510851419, "step": 3800}, {"loss": 1.0637, "grad_norm": 1.249742031097412, "learning_rate": 0.0002, "epoch": 6.360601001669449, "step": 3810}, {"loss": 1.0521, "grad_norm": 1.2729560136795044, "learning_rate": 0.0002, "epoch": 6.377295492487479, "step": 3820}, {"loss": 1.1217, "grad_norm": 1.241761565208435, "learning_rate": 0.0002, "epoch": 6.393989983305509, "step": 3830}, {"loss": 1.0648, "grad_norm": 1.1892873048782349, "learning_rate": 0.0002, "epoch": 6.410684474123539, "step": 3840}, {"loss": 1.1092, "grad_norm": 1.1766357421875, "learning_rate": 0.0002, "epoch": 6.427378964941569, "step": 3850}, {"loss": 1.0872, "grad_norm": 1.2642168998718262, "learning_rate": 0.0002, "epoch": 6.444073455759599, "step": 3860}, {"loss": 1.0748, "grad_norm": 1.3390182256698608, "learning_rate": 0.0002, "epoch": 6.460767946577629, "step": 3870}, {"loss": 1.0657, "grad_norm": 1.183168649673462, "learning_rate": 0.0002, "epoch": 6.477462437395659, "step": 3880}, {"loss": 1.0696, "grad_norm": 1.1458892822265625, "learning_rate": 0.0002, "epoch": 6.494156928213689, "step": 3890}, {"loss": 1.1625, "grad_norm": 1.2736095190048218, "learning_rate": 0.0002, "epoch": 6.510851419031719, "step": 3900}, {"loss": 1.1175, "grad_norm": 1.323607087135315, "learning_rate": 0.0002, "epoch": 6.527545909849749, "step": 3910}, {"loss": 1.1258, "grad_norm": 1.2177817821502686, "learning_rate": 0.0002, "epoch": 6.54424040066778, "step": 3920}, {"loss": 1.0333, "grad_norm": 1.3270750045776367, "learning_rate": 0.0002, "epoch": 6.560934891485809, "step": 3930}, {"loss": 1.0589, "grad_norm": 1.0974372625350952, "learning_rate": 0.0002, "epoch": 6.57762938230384, "step": 3940}, {"loss": 1.1347, "grad_norm": 1.3352670669555664, "learning_rate": 0.0002, "epoch": 6.59432387312187, "step": 3950}, {"loss": 1.0684, "grad_norm": 1.3174126148223877, "learning_rate": 0.0002, "epoch": 6.6110183639399, "step": 3960}, {"loss": 1.1697, "grad_norm": 1.1783626079559326, "learning_rate": 0.0002, "epoch": 6.62771285475793, "step": 3970}, {"loss": 1.1256, "grad_norm": 1.1886446475982666, "learning_rate": 0.0002, "epoch": 6.64440734557596, "step": 3980}, {"loss": 1.1066, "grad_norm": 1.2215187549591064, "learning_rate": 0.0002, "epoch": 6.66110183639399, "step": 3990}, {"loss": 1.1236, "grad_norm": 1.0320725440979004, "learning_rate": 0.0002, "epoch": 6.67779632721202, "step": 4000}, {"loss": 1.0828, "grad_norm": 1.340338110923767, "learning_rate": 0.0002, "epoch": 6.69449081803005, "step": 4010}, {"loss": 1.0942, "grad_norm": 1.1496273279190063, "learning_rate": 0.0002, "epoch": 6.71118530884808, "step": 4020}, {"loss": 1.1465, "grad_norm": 1.5720409154891968, "learning_rate": 0.0002, "epoch": 6.72787979966611, "step": 4030}, {"loss": 1.1385, "grad_norm": 1.497376799583435, "learning_rate": 0.0002, "epoch": 6.74457429048414, "step": 4040}, {"loss": 1.0808, "grad_norm": 1.1594456434249878, "learning_rate": 0.0002, "epoch": 6.76126878130217, "step": 4050}, {"loss": 1.1541, "grad_norm": 1.326546549797058, "learning_rate": 0.0002, "epoch": 6.7779632721202, "step": 4060}, {"loss": 1.1314, "grad_norm": 1.18723726272583, "learning_rate": 0.0002, "epoch": 6.794657762938231, "step": 4070}, {"loss": 1.1906, "grad_norm": 1.2974154949188232, "learning_rate": 0.0002, "epoch": 6.811352253756261, "step": 4080}, {"loss": 1.0534, "grad_norm": 1.207748532295227, "learning_rate": 0.0002, "epoch": 6.828046744574291, "step": 4090}, {"loss": 1.0951, "grad_norm": 1.2398537397384644, "learning_rate": 0.0002, "epoch": 6.844741235392321, "step": 4100}, {"loss": 1.1348, "grad_norm": 1.1657508611679077, "learning_rate": 0.0002, "epoch": 6.861435726210351, "step": 4110}, {"loss": 1.1315, "grad_norm": 1.1986382007598877, "learning_rate": 0.0002, "epoch": 6.878130217028381, "step": 4120}, {"loss": 1.0781, "grad_norm": 1.407080054283142, "learning_rate": 0.0002, "epoch": 6.894824707846411, "step": 4130}, {"loss": 1.0515, "grad_norm": 1.0725297927856445, "learning_rate": 0.0002, "epoch": 6.911519198664441, "step": 4140}, {"loss": 1.1602, "grad_norm": 1.2659991979599, "learning_rate": 0.0002, "epoch": 6.928213689482471, "step": 4150}, {"loss": 1.1373, "grad_norm": 1.0579404830932617, "learning_rate": 0.0002, "epoch": 6.944908180300501, "step": 4160}, {"loss": 1.1441, "grad_norm": 1.254502296447754, "learning_rate": 0.0002, "epoch": 6.961602671118531, "step": 4170}, {"loss": 1.1019, "grad_norm": 1.2666021585464478, "learning_rate": 0.0002, "epoch": 6.978297161936561, "step": 4180}, {"loss": 1.0675, "grad_norm": 1.236793041229248, "learning_rate": 0.0002, "epoch": 6.994991652754591, "step": 4190}, {"eval_loss": 2.211871862411499, "eval_runtime": 56.9215, "eval_samples_per_second": 9.048, "eval_steps_per_second": 1.142, "epoch": 7.0, "step": 4193}, {"loss": 1.0092, "grad_norm": 1.8114486932754517, "learning_rate": 0.0002, "epoch": 7.011686143572621, "step": 4200}, {"loss": 0.8505, "grad_norm": 2.062814235687256, "learning_rate": 0.0002, "epoch": 7.028380634390651, "step": 4210}, {"loss": 0.8606, "grad_norm": 1.4835841655731201, "learning_rate": 0.0002, "epoch": 7.045075125208681, "step": 4220}, {"loss": 0.862, "grad_norm": 1.3040175437927246, "learning_rate": 0.0002, "epoch": 7.061769616026711, "step": 4230}, {"loss": 0.9513, "grad_norm": 1.3654398918151855, "learning_rate": 0.0002, "epoch": 7.078464106844741, "step": 4240}, {"loss": 0.9272, "grad_norm": 1.3989132642745972, "learning_rate": 0.0002, "epoch": 7.095158597662771, "step": 4250}, {"loss": 0.9062, "grad_norm": 1.2168488502502441, "learning_rate": 0.0002, "epoch": 7.111853088480801, "step": 4260}, {"loss": 0.8792, "grad_norm": 1.52049720287323, "learning_rate": 0.0002, "epoch": 7.128547579298831, "step": 4270}, {"loss": 0.8486, "grad_norm": 1.4944370985031128, "learning_rate": 0.0002, "epoch": 7.145242070116861, "step": 4280}, {"loss": 0.8757, "grad_norm": 1.4657515287399292, "learning_rate": 0.0002, "epoch": 7.161936560934891, "step": 4290}, {"loss": 0.9209, "grad_norm": 1.373306155204773, "learning_rate": 0.0002, "epoch": 7.178631051752921, "step": 4300}, {"loss": 0.964, "grad_norm": 1.3957229852676392, "learning_rate": 0.0002, "epoch": 7.195325542570951, "step": 4310}, {"loss": 0.8777, "grad_norm": 1.3072983026504517, "learning_rate": 0.0002, "epoch": 7.212020033388981, "step": 4320}, {"loss": 0.8515, "grad_norm": 1.3311468362808228, "learning_rate": 0.0002, "epoch": 7.228714524207012, "step": 4330}, {"loss": 0.9641, "grad_norm": 1.3969240188598633, "learning_rate": 0.0002, "epoch": 7.245409015025042, "step": 4340}, {"loss": 0.8767, "grad_norm": 1.496384859085083, "learning_rate": 0.0002, "epoch": 7.262103505843072, "step": 4350}, {"loss": 0.9378, "grad_norm": 1.38449227809906, "learning_rate": 0.0002, "epoch": 7.278797996661102, "step": 4360}, {"loss": 0.9299, "grad_norm": 1.397478699684143, "learning_rate": 0.0002, "epoch": 7.295492487479132, "step": 4370}, {"loss": 0.9067, "grad_norm": 1.234455943107605, "learning_rate": 0.0002, "epoch": 7.312186978297162, "step": 4380}, {"loss": 0.9761, "grad_norm": 1.3813334703445435, "learning_rate": 0.0002, "epoch": 7.328881469115192, "step": 4390}, {"loss": 0.8766, "grad_norm": 1.3944685459136963, "learning_rate": 0.0002, "epoch": 7.345575959933222, "step": 4400}, {"loss": 0.9164, "grad_norm": 1.5999382734298706, "learning_rate": 0.0002, "epoch": 7.362270450751252, "step": 4410}, {"loss": 0.9286, "grad_norm": 1.753442406654358, "learning_rate": 0.0002, "epoch": 7.378964941569282, "step": 4420}, {"loss": 0.9248, "grad_norm": 1.4564250707626343, "learning_rate": 0.0002, "epoch": 7.395659432387312, "step": 4430}, {"loss": 0.9011, "grad_norm": 1.488957166671753, "learning_rate": 0.0002, "epoch": 7.412353923205342, "step": 4440}, {"loss": 0.9268, "grad_norm": 1.5810562372207642, "learning_rate": 0.0002, "epoch": 7.429048414023372, "step": 4450}, {"loss": 0.9033, "grad_norm": 1.2961808443069458, "learning_rate": 0.0002, "epoch": 7.445742904841403, "step": 4460}, {"loss": 0.951, "grad_norm": 1.4854587316513062, "learning_rate": 0.0002, "epoch": 7.462437395659433, "step": 4470}, {"loss": 0.9627, "grad_norm": 1.5555771589279175, "learning_rate": 0.0002, "epoch": 7.479131886477463, "step": 4480}, {"loss": 0.952, "grad_norm": 1.5276654958724976, "learning_rate": 0.0002, "epoch": 7.495826377295493, "step": 4490}, {"loss": 0.9679, "grad_norm": 1.4847941398620605, "learning_rate": 0.0002, "epoch": 7.512520868113523, "step": 4500}, {"loss": 0.9613, "grad_norm": 1.4122779369354248, "learning_rate": 0.0002, "epoch": 7.529215358931553, "step": 4510}, {"loss": 0.9882, "grad_norm": 1.497211217880249, "learning_rate": 0.0002, "epoch": 7.545909849749583, "step": 4520}, {"loss": 0.9778, "grad_norm": 1.4892537593841553, "learning_rate": 0.0002, "epoch": 7.562604340567613, "step": 4530}, {"loss": 0.9743, "grad_norm": 1.2664510011672974, "learning_rate": 0.0002, "epoch": 7.579298831385643, "step": 4540}, {"loss": 0.9311, "grad_norm": 1.4286391735076904, "learning_rate": 0.0002, "epoch": 7.595993322203673, "step": 4550}, {"loss": 0.995, "grad_norm": 1.4727665185928345, "learning_rate": 0.0002, "epoch": 7.612687813021703, "step": 4560}, {"loss": 0.8821, "grad_norm": 1.4128608703613281, "learning_rate": 0.0002, "epoch": 7.629382303839733, "step": 4570}, {"loss": 0.9227, "grad_norm": 1.4077776670455933, "learning_rate": 0.0002, "epoch": 7.646076794657763, "step": 4580}, {"loss": 0.9552, "grad_norm": 1.760135293006897, "learning_rate": 0.0002, "epoch": 7.6627712854757934, "step": 4590}, {"loss": 0.9896, "grad_norm": 1.450317144393921, "learning_rate": 0.0002, "epoch": 7.6794657762938225, "step": 4600}, {"loss": 0.9701, "grad_norm": 1.445032000541687, "learning_rate": 0.0002, "epoch": 7.696160267111853, "step": 4610}, {"loss": 0.975, "grad_norm": 1.3218955993652344, "learning_rate": 0.0002, "epoch": 7.712854757929883, "step": 4620}, {"loss": 0.9947, "grad_norm": 1.3336185216903687, "learning_rate": 0.0002, "epoch": 7.729549248747913, "step": 4630}, {"loss": 0.9918, "grad_norm": 1.3436596393585205, "learning_rate": 0.0002, "epoch": 7.746243739565943, "step": 4640}, {"loss": 1.0155, "grad_norm": 1.4396946430206299, "learning_rate": 0.0002, "epoch": 7.762938230383973, "step": 4650}, {"loss": 0.9928, "grad_norm": 1.5268234014511108, "learning_rate": 0.0002, "epoch": 7.779632721202003, "step": 4660}, {"loss": 0.9871, "grad_norm": 1.3981901407241821, "learning_rate": 0.0002, "epoch": 7.796327212020033, "step": 4670}, {"loss": 0.9414, "grad_norm": 1.6962796449661255, "learning_rate": 0.0002, "epoch": 7.813021702838063, "step": 4680}, {"loss": 0.9576, "grad_norm": 1.4803595542907715, "learning_rate": 0.0002, "epoch": 7.829716193656093, "step": 4690}, {"loss": 0.9772, "grad_norm": 1.4438055753707886, "learning_rate": 0.0002, "epoch": 7.846410684474123, "step": 4700}, {"loss": 0.961, "grad_norm": 1.2435152530670166, "learning_rate": 0.0002, "epoch": 7.863105175292153, "step": 4710}, {"loss": 1.0079, "grad_norm": 1.7456961870193481, "learning_rate": 0.0002, "epoch": 7.879799666110184, "step": 4720}, {"loss": 0.9359, "grad_norm": 1.1902697086334229, "learning_rate": 0.0002, "epoch": 7.896494156928213, "step": 4730}, {"loss": 1.0037, "grad_norm": 1.5772660970687866, "learning_rate": 0.0002, "epoch": 7.913188647746244, "step": 4740}, {"loss": 1.0633, "grad_norm": 1.593420386314392, "learning_rate": 0.0002, "epoch": 7.929883138564274, "step": 4750}, {"loss": 1.0801, "grad_norm": 1.3951916694641113, "learning_rate": 0.0002, "epoch": 7.946577629382304, "step": 4760}, {"loss": 0.9775, "grad_norm": 1.2561997175216675, "learning_rate": 0.0002, "epoch": 7.963272120200334, "step": 4770}, {"loss": 0.9279, "grad_norm": 1.3175349235534668, "learning_rate": 0.0002, "epoch": 7.979966611018364, "step": 4780}, {"loss": 1.0438, "grad_norm": 1.4960309267044067, "learning_rate": 0.0002, "epoch": 7.996661101836394, "step": 4790}]}