diff --git a/.gitattributes b/.gitattributes index 2068b7be3e3eb2d2189f8ba412b1a95f3cb82935..3d20020bcc2cf011c4c3f783211317cf8d61ec01 100644 --- a/.gitattributes +++ b/.gitattributes @@ -213,3 +213,12 @@ Meta-Llama-3-8B-Instruct_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512 Meta-Llama-3-8B-Instruct_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-177-sd-1/checkpoint-73/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-177-sd-1/checkpoint-80/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_winogrande-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-177-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95e7a4cae71b69e4eaf9b054a8e34c181b140016 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22dcc0f054d062272ed31bf92071323178f08e6ec87762c3a87f14ad6069784f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95e7a4cae71b69e4eaf9b054a8e34c181b140016 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22dcc0f054d062272ed31bf92071323178f08e6ec87762c3a87f14ad6069784f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a39bb964fce44346590ce7e63755d87bb0288ae3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535a073506281e1eefb5cacccc9cba0f6b2199b0d98180b3e77a727f5c4867b4 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a7faf8c452a1924b827d5ff026d26835b8f4c79 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50dab4e129971e0e79e45360713facc6f8554139f5f916bd7717b107da9ed95b +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8097214ad963dccf8a308f66b25c96b339fa163b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df6959f19b564ff58381bbd14c6430271d7cbf32aec3c0081509e7210bca5f71 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..89e9bef65eaebbd3009fdc7d8b6d32c13f2d8faa --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/trainer_state.json @@ -0,0 +1,1120 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1536, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.108264075670323e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..091781c56558781e9b4a5b4287f2500da94f950b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186be21fc0fbc2631f52500085fab9cb6d3b4a7a25d0087840ef5e7a731af8df +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfa4325ac5d1ca1e40434f21e05d810a5689ffaf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825de6c3bbcfd2d43dedf17de63d41ddd3e6dbcb26e5e61d75abefb568060e65 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..07395c4f25e082179aaa94fc22ebb8fa8250eb8d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f3c8cb81dab6c4ad3616863e39342e9ea3b3c0302fc932089924ac5d611c20e +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..615aad041af6bfd8b1df14073bd4e3c188ab20e6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975fca589b2ce7868825cf45fbcf13cf82836abd061bc0255aa355b3ba58bd5a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b9abd00a871f324f1d74fbfc2bf3449e1e16b979 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/trainer_state.json @@ -0,0 +1,1667 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0662396113505485e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-2304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8833456fd6f866ac8f545b6972284bb330b28799 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd138f72c72c818a363284307bf794e28c5301b57673fdb1d0623216b8ee7061 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0ebdb66a2996a7c456746c40cf74ff9b7fae825 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d600c43b2a240f14bb82f038e53858e5f34fa157550900fef3d5f01cad092d79 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee09d43099b2b75bfc4f253ae2e11fa9188a55b7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0445be17be2300c427e1c86495cb2f34cf18d8f7d045b934131a07c7daaa467 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3b6e61ff25fbe77f56a6339d58ecd30db5f8857 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e561a105598a909bc99956e3a8e551ac80fd51eb9d26a0749f535e1bd57622 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e3c7d7c899142b2eb061588e23c0ff0e9d1d187 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/trainer_state.json @@ -0,0 +1,2214 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 3072, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.4801871180534363, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 2310 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.5789998173713684, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 2320 + }, + { + "epoch": 3.0338541666666665, + "grad_norm": 0.49856704473495483, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2330 + }, + { + "epoch": 3.046875, + "grad_norm": 0.5625631213188171, + "learning_rate": 0.0002, + "loss": 1.4718, + "step": 2340 + }, + { + "epoch": 3.0598958333333335, + "grad_norm": 0.557637095451355, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 2350 + }, + { + "epoch": 3.0729166666666665, + "grad_norm": 0.528889536857605, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 2360 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.5952284932136536, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 2370 + }, + { + "epoch": 3.0989583333333335, + "grad_norm": 0.5549899339675903, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 2380 + }, + { + "epoch": 3.1119791666666665, + "grad_norm": 0.662139892578125, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 2390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5281530618667603, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 3.1380208333333335, + "grad_norm": 0.6134106516838074, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 2410 + }, + { + "epoch": 3.1510416666666665, + "grad_norm": 0.6040887236595154, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2420 + }, + { + "epoch": 3.1640625, + "grad_norm": 0.549672544002533, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 2430 + }, + { + "epoch": 3.1770833333333335, + "grad_norm": 0.9195653796195984, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 2440 + }, + { + "epoch": 3.1901041666666665, + "grad_norm": 0.5578703284263611, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 2450 + }, + { + "epoch": 3.203125, + "grad_norm": 0.5982925891876221, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 2460 + }, + { + "epoch": 3.2161458333333335, + "grad_norm": 0.5544393062591553, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 2470 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.6015266180038452, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2480 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.5995243191719055, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2490 + }, + { + "epoch": 3.2552083333333335, + "grad_norm": 0.5846129059791565, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 2500 + }, + { + "epoch": 3.2682291666666665, + "grad_norm": 0.5552570223808289, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 3.28125, + "grad_norm": 0.576998233795166, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 2520 + }, + { + "epoch": 3.2942708333333335, + "grad_norm": 0.6526138186454773, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2530 + }, + { + "epoch": 3.3072916666666665, + "grad_norm": 0.6064265966415405, + "learning_rate": 0.0002, + "loss": 1.474, + "step": 2540 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.5542362928390503, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 2550 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6048482060432434, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 2560 + }, + { + "epoch": 3.3463541666666665, + "grad_norm": 0.6328344941139221, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 2570 + }, + { + "epoch": 3.359375, + "grad_norm": 0.6347311735153198, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2580 + }, + { + "epoch": 3.3723958333333335, + "grad_norm": 0.537570595741272, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2590 + }, + { + "epoch": 3.3854166666666665, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 2600 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.5914373993873596, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2610 + }, + { + "epoch": 3.4114583333333335, + "grad_norm": 0.6724640130996704, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 2620 + }, + { + "epoch": 3.4244791666666665, + "grad_norm": 0.6295472383499146, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2630 + }, + { + "epoch": 3.4375, + "grad_norm": 0.5842770934104919, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 2640 + }, + { + "epoch": 3.4505208333333335, + "grad_norm": 0.6297776699066162, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 2650 + }, + { + "epoch": 3.4635416666666665, + "grad_norm": 0.6105847358703613, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2660 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6294940710067749, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 2670 + }, + { + "epoch": 3.4895833333333335, + "grad_norm": 0.6573333740234375, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2680 + }, + { + "epoch": 3.5026041666666665, + "grad_norm": 0.663661539554596, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 2690 + }, + { + "epoch": 3.515625, + "grad_norm": 0.6729148626327515, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 2700 + }, + { + "epoch": 3.5286458333333335, + "grad_norm": 0.6633102893829346, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 2710 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.567686915397644, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 2720 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.6281962394714355, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 2730 + }, + { + "epoch": 3.5677083333333335, + "grad_norm": 0.5710738897323608, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 2740 + }, + { + "epoch": 3.5807291666666665, + "grad_norm": 0.648162305355072, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 2750 + }, + { + "epoch": 3.59375, + "grad_norm": 0.5466254949569702, + "learning_rate": 0.0002, + "loss": 1.4294, + "step": 2760 + }, + { + "epoch": 3.6067708333333335, + "grad_norm": 0.6867973208427429, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2770 + }, + { + "epoch": 3.6197916666666665, + "grad_norm": 0.673612117767334, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2780 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.6928417086601257, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2790 + }, + { + "epoch": 3.6458333333333335, + "grad_norm": 0.6603742837905884, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2800 + }, + { + "epoch": 3.6588541666666665, + "grad_norm": 0.5964401960372925, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 2810 + }, + { + "epoch": 3.671875, + "grad_norm": 0.6224474310874939, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 2820 + }, + { + "epoch": 3.6848958333333335, + "grad_norm": 0.6592439413070679, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 2830 + }, + { + "epoch": 3.6979166666666665, + "grad_norm": 0.6255369186401367, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 2840 + }, + { + "epoch": 3.7109375, + "grad_norm": 0.7136337757110596, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 2850 + }, + { + "epoch": 3.7239583333333335, + "grad_norm": 0.6229757070541382, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 3.7369791666666665, + "grad_norm": 0.696080207824707, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2870 + }, + { + "epoch": 3.75, + "grad_norm": 0.571873664855957, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2880 + }, + { + "epoch": 3.7630208333333335, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 2890 + }, + { + "epoch": 3.7760416666666665, + "grad_norm": 0.616413950920105, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 2900 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.6267292499542236, + "learning_rate": 0.0002, + "loss": 1.4215, + "step": 2910 + }, + { + "epoch": 3.8020833333333335, + "grad_norm": 0.6630783677101135, + "learning_rate": 0.0002, + "loss": 1.5095, + "step": 2920 + }, + { + "epoch": 3.8151041666666665, + "grad_norm": 0.6004238724708557, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 2930 + }, + { + "epoch": 3.828125, + "grad_norm": 0.6740423440933228, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2940 + }, + { + "epoch": 3.8411458333333335, + "grad_norm": 0.6397785544395447, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 2950 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.6063735485076904, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2960 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.6462053060531616, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 2970 + }, + { + "epoch": 3.8802083333333335, + "grad_norm": 0.7143250107765198, + "learning_rate": 0.0002, + "loss": 1.5237, + "step": 2980 + }, + { + "epoch": 3.8932291666666665, + "grad_norm": 0.6747874617576599, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.622930109500885, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 3000 + }, + { + "epoch": 3.9192708333333335, + "grad_norm": 0.620193600654602, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 3010 + }, + { + "epoch": 3.9322916666666665, + "grad_norm": 0.6321487426757812, + "learning_rate": 0.0002, + "loss": 1.495, + "step": 3020 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 3030 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.6185072660446167, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 3040 + }, + { + "epoch": 3.9713541666666665, + "grad_norm": 0.6005704998970032, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 3050 + }, + { + "epoch": 3.984375, + "grad_norm": 0.5933769941329956, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 3060 + }, + { + "epoch": 3.9973958333333335, + "grad_norm": 0.695209801197052, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 3070 + }, + { + "epoch": 4.0, + "eval_loss": 1.8955267667770386, + "eval_runtime": 103.5061, + "eval_samples_per_second": 4.976, + "eval_steps_per_second": 0.628, + "step": 3072 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4216528151340646e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3072/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c031eeb55c7ce53522a8d1e701391c955d8274f1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b76e7a52d12b09609436368e95d0fc34f0d6d3313d71883900743a5f99fb17d4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9cf96b9a41d6c26201243c35210cb77eb8849a68 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5bc373558b334dca2ebfabb2464b0184f8c5325f3bcb4b669952418c7a0e933 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1dd0e77a03ae049f1ebd49a5dd5fe8e0727259c2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d383f8f3064a1bd18016562aaef1082fad2c652ed43d49aaa6c66757374dbf0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa467c88c42251701ae2a04305e1e5491794171f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d74ec787344ce085add2219776b675e89fe6354008252fbddbd1461c59b19be8 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..65ca314383501be712d25780fcf2aa51351844d1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/trainer_state.json @@ -0,0 +1,2761 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 3840, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.4801871180534363, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 2310 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.5789998173713684, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 2320 + }, + { + "epoch": 3.0338541666666665, + "grad_norm": 0.49856704473495483, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2330 + }, + { + "epoch": 3.046875, + "grad_norm": 0.5625631213188171, + "learning_rate": 0.0002, + "loss": 1.4718, + "step": 2340 + }, + { + "epoch": 3.0598958333333335, + "grad_norm": 0.557637095451355, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 2350 + }, + { + "epoch": 3.0729166666666665, + "grad_norm": 0.528889536857605, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 2360 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.5952284932136536, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 2370 + }, + { + "epoch": 3.0989583333333335, + "grad_norm": 0.5549899339675903, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 2380 + }, + { + "epoch": 3.1119791666666665, + "grad_norm": 0.662139892578125, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 2390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5281530618667603, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 3.1380208333333335, + "grad_norm": 0.6134106516838074, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 2410 + }, + { + "epoch": 3.1510416666666665, + "grad_norm": 0.6040887236595154, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2420 + }, + { + "epoch": 3.1640625, + "grad_norm": 0.549672544002533, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 2430 + }, + { + "epoch": 3.1770833333333335, + "grad_norm": 0.9195653796195984, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 2440 + }, + { + "epoch": 3.1901041666666665, + "grad_norm": 0.5578703284263611, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 2450 + }, + { + "epoch": 3.203125, + "grad_norm": 0.5982925891876221, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 2460 + }, + { + "epoch": 3.2161458333333335, + "grad_norm": 0.5544393062591553, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 2470 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.6015266180038452, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2480 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.5995243191719055, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2490 + }, + { + "epoch": 3.2552083333333335, + "grad_norm": 0.5846129059791565, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 2500 + }, + { + "epoch": 3.2682291666666665, + "grad_norm": 0.5552570223808289, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 3.28125, + "grad_norm": 0.576998233795166, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 2520 + }, + { + "epoch": 3.2942708333333335, + "grad_norm": 0.6526138186454773, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2530 + }, + { + "epoch": 3.3072916666666665, + "grad_norm": 0.6064265966415405, + "learning_rate": 0.0002, + "loss": 1.474, + "step": 2540 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.5542362928390503, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 2550 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6048482060432434, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 2560 + }, + { + "epoch": 3.3463541666666665, + "grad_norm": 0.6328344941139221, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 2570 + }, + { + "epoch": 3.359375, + "grad_norm": 0.6347311735153198, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2580 + }, + { + "epoch": 3.3723958333333335, + "grad_norm": 0.537570595741272, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2590 + }, + { + "epoch": 3.3854166666666665, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 2600 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.5914373993873596, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2610 + }, + { + "epoch": 3.4114583333333335, + "grad_norm": 0.6724640130996704, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 2620 + }, + { + "epoch": 3.4244791666666665, + "grad_norm": 0.6295472383499146, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2630 + }, + { + "epoch": 3.4375, + "grad_norm": 0.5842770934104919, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 2640 + }, + { + "epoch": 3.4505208333333335, + "grad_norm": 0.6297776699066162, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 2650 + }, + { + "epoch": 3.4635416666666665, + "grad_norm": 0.6105847358703613, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2660 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6294940710067749, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 2670 + }, + { + "epoch": 3.4895833333333335, + "grad_norm": 0.6573333740234375, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2680 + }, + { + "epoch": 3.5026041666666665, + "grad_norm": 0.663661539554596, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 2690 + }, + { + "epoch": 3.515625, + "grad_norm": 0.6729148626327515, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 2700 + }, + { + "epoch": 3.5286458333333335, + "grad_norm": 0.6633102893829346, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 2710 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.567686915397644, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 2720 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.6281962394714355, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 2730 + }, + { + "epoch": 3.5677083333333335, + "grad_norm": 0.5710738897323608, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 2740 + }, + { + "epoch": 3.5807291666666665, + "grad_norm": 0.648162305355072, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 2750 + }, + { + "epoch": 3.59375, + "grad_norm": 0.5466254949569702, + "learning_rate": 0.0002, + "loss": 1.4294, + "step": 2760 + }, + { + "epoch": 3.6067708333333335, + "grad_norm": 0.6867973208427429, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2770 + }, + { + "epoch": 3.6197916666666665, + "grad_norm": 0.673612117767334, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2780 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.6928417086601257, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2790 + }, + { + "epoch": 3.6458333333333335, + "grad_norm": 0.6603742837905884, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2800 + }, + { + "epoch": 3.6588541666666665, + "grad_norm": 0.5964401960372925, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 2810 + }, + { + "epoch": 3.671875, + "grad_norm": 0.6224474310874939, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 2820 + }, + { + "epoch": 3.6848958333333335, + "grad_norm": 0.6592439413070679, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 2830 + }, + { + "epoch": 3.6979166666666665, + "grad_norm": 0.6255369186401367, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 2840 + }, + { + "epoch": 3.7109375, + "grad_norm": 0.7136337757110596, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 2850 + }, + { + "epoch": 3.7239583333333335, + "grad_norm": 0.6229757070541382, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 3.7369791666666665, + "grad_norm": 0.696080207824707, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2870 + }, + { + "epoch": 3.75, + "grad_norm": 0.571873664855957, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2880 + }, + { + "epoch": 3.7630208333333335, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 2890 + }, + { + "epoch": 3.7760416666666665, + "grad_norm": 0.616413950920105, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 2900 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.6267292499542236, + "learning_rate": 0.0002, + "loss": 1.4215, + "step": 2910 + }, + { + "epoch": 3.8020833333333335, + "grad_norm": 0.6630783677101135, + "learning_rate": 0.0002, + "loss": 1.5095, + "step": 2920 + }, + { + "epoch": 3.8151041666666665, + "grad_norm": 0.6004238724708557, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 2930 + }, + { + "epoch": 3.828125, + "grad_norm": 0.6740423440933228, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2940 + }, + { + "epoch": 3.8411458333333335, + "grad_norm": 0.6397785544395447, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 2950 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.6063735485076904, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2960 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.6462053060531616, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 2970 + }, + { + "epoch": 3.8802083333333335, + "grad_norm": 0.7143250107765198, + "learning_rate": 0.0002, + "loss": 1.5237, + "step": 2980 + }, + { + "epoch": 3.8932291666666665, + "grad_norm": 0.6747874617576599, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.622930109500885, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 3000 + }, + { + "epoch": 3.9192708333333335, + "grad_norm": 0.620193600654602, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 3010 + }, + { + "epoch": 3.9322916666666665, + "grad_norm": 0.6321487426757812, + "learning_rate": 0.0002, + "loss": 1.495, + "step": 3020 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 3030 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.6185072660446167, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 3040 + }, + { + "epoch": 3.9713541666666665, + "grad_norm": 0.6005704998970032, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 3050 + }, + { + "epoch": 3.984375, + "grad_norm": 0.5933769941329956, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 3060 + }, + { + "epoch": 3.9973958333333335, + "grad_norm": 0.695209801197052, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 3070 + }, + { + "epoch": 4.0, + "eval_loss": 1.8955267667770386, + "eval_runtime": 103.5061, + "eval_samples_per_second": 4.976, + "eval_steps_per_second": 0.628, + "step": 3072 + }, + { + "epoch": 4.010416666666667, + "grad_norm": 0.6706188321113586, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 3080 + }, + { + "epoch": 4.0234375, + "grad_norm": 0.7263980507850647, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3090 + }, + { + "epoch": 4.036458333333333, + "grad_norm": 0.7767240405082703, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 3100 + }, + { + "epoch": 4.049479166666667, + "grad_norm": 0.6888399124145508, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 3110 + }, + { + "epoch": 4.0625, + "grad_norm": 0.8860331773757935, + "learning_rate": 0.0002, + "loss": 1.2422, + "step": 3120 + }, + { + "epoch": 4.075520833333333, + "grad_norm": 0.7572373151779175, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3130 + }, + { + "epoch": 4.088541666666667, + "grad_norm": 0.8321536183357239, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 3140 + }, + { + "epoch": 4.1015625, + "grad_norm": 0.7042664885520935, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 3150 + }, + { + "epoch": 4.114583333333333, + "grad_norm": 0.8910216689109802, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 3160 + }, + { + "epoch": 4.127604166666667, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3170 + }, + { + "epoch": 4.140625, + "grad_norm": 0.7120883464813232, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 3180 + }, + { + "epoch": 4.153645833333333, + "grad_norm": 0.6904631853103638, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 3190 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.6398878693580627, + "learning_rate": 0.0002, + "loss": 1.2881, + "step": 3200 + }, + { + "epoch": 4.1796875, + "grad_norm": 0.7573692798614502, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 3210 + }, + { + "epoch": 4.192708333333333, + "grad_norm": 0.7850743532180786, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 3220 + }, + { + "epoch": 4.205729166666667, + "grad_norm": 0.7863165736198425, + "learning_rate": 0.0002, + "loss": 1.3176, + "step": 3230 + }, + { + "epoch": 4.21875, + "grad_norm": 0.7855865359306335, + "learning_rate": 0.0002, + "loss": 1.3739, + "step": 3240 + }, + { + "epoch": 4.231770833333333, + "grad_norm": 0.6840922832489014, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 3250 + }, + { + "epoch": 4.244791666666667, + "grad_norm": 0.8499747514724731, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 3260 + }, + { + "epoch": 4.2578125, + "grad_norm": 0.7982883453369141, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 3270 + }, + { + "epoch": 4.270833333333333, + "grad_norm": 0.7776934504508972, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 3280 + }, + { + "epoch": 4.283854166666667, + "grad_norm": 0.8887693881988525, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 3290 + }, + { + "epoch": 4.296875, + "grad_norm": 1.0184714794158936, + "learning_rate": 0.0002, + "loss": 1.3213, + "step": 3300 + }, + { + "epoch": 4.309895833333333, + "grad_norm": 0.7539387345314026, + "learning_rate": 0.0002, + "loss": 1.3212, + "step": 3310 + }, + { + "epoch": 4.322916666666667, + "grad_norm": 0.8137491345405579, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 3320 + }, + { + "epoch": 4.3359375, + "grad_norm": 0.8136276006698608, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 3330 + }, + { + "epoch": 4.348958333333333, + "grad_norm": 0.7880964279174805, + "learning_rate": 0.0002, + "loss": 1.3512, + "step": 3340 + }, + { + "epoch": 4.361979166666667, + "grad_norm": 0.8654456734657288, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 3350 + }, + { + "epoch": 4.375, + "grad_norm": 0.8093366622924805, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3360 + }, + { + "epoch": 4.388020833333333, + "grad_norm": 0.8738575577735901, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 3370 + }, + { + "epoch": 4.401041666666667, + "grad_norm": 0.8923026919364929, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3380 + }, + { + "epoch": 4.4140625, + "grad_norm": 0.8508910536766052, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 3390 + }, + { + "epoch": 4.427083333333333, + "grad_norm": 0.8262084722518921, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 3400 + }, + { + "epoch": 4.440104166666667, + "grad_norm": 0.7843561768531799, + "learning_rate": 0.0002, + "loss": 1.3145, + "step": 3410 + }, + { + "epoch": 4.453125, + "grad_norm": 0.9087795615196228, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3420 + }, + { + "epoch": 4.466145833333333, + "grad_norm": 0.8278809189796448, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3430 + }, + { + "epoch": 4.479166666666667, + "grad_norm": 0.8337010741233826, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 3440 + }, + { + "epoch": 4.4921875, + "grad_norm": 0.7790088057518005, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 3450 + }, + { + "epoch": 4.505208333333333, + "grad_norm": 0.826231837272644, + "learning_rate": 0.0002, + "loss": 1.3344, + "step": 3460 + }, + { + "epoch": 4.518229166666667, + "grad_norm": 0.761461079120636, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 3470 + }, + { + "epoch": 4.53125, + "grad_norm": 0.8892785906791687, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 3480 + }, + { + "epoch": 4.544270833333333, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 3490 + }, + { + "epoch": 4.557291666666667, + "grad_norm": 0.8259274363517761, + "learning_rate": 0.0002, + "loss": 1.3167, + "step": 3500 + }, + { + "epoch": 4.5703125, + "grad_norm": 0.821164071559906, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3510 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.7262887954711914, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 3520 + }, + { + "epoch": 4.596354166666667, + "grad_norm": 0.8564826250076294, + "learning_rate": 0.0002, + "loss": 1.3777, + "step": 3530 + }, + { + "epoch": 4.609375, + "grad_norm": 0.8072929978370667, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3540 + }, + { + "epoch": 4.622395833333333, + "grad_norm": 0.8040832877159119, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 3550 + }, + { + "epoch": 4.635416666666667, + "grad_norm": 0.7268754839897156, + "learning_rate": 0.0002, + "loss": 1.2863, + "step": 3560 + }, + { + "epoch": 4.6484375, + "grad_norm": 0.9985134601593018, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3570 + }, + { + "epoch": 4.661458333333333, + "grad_norm": 0.9826098680496216, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 3580 + }, + { + "epoch": 4.674479166666667, + "grad_norm": 0.8794422149658203, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 3590 + }, + { + "epoch": 4.6875, + "grad_norm": 0.7207489609718323, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3600 + }, + { + "epoch": 4.700520833333333, + "grad_norm": 0.7546059489250183, + "learning_rate": 0.0002, + "loss": 1.3192, + "step": 3610 + }, + { + "epoch": 4.713541666666667, + "grad_norm": 0.8318526148796082, + "learning_rate": 0.0002, + "loss": 1.3445, + "step": 3620 + }, + { + "epoch": 4.7265625, + "grad_norm": 0.7529309391975403, + "learning_rate": 0.0002, + "loss": 1.3847, + "step": 3630 + }, + { + "epoch": 4.739583333333333, + "grad_norm": 0.7762532234191895, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3640 + }, + { + "epoch": 4.752604166666667, + "grad_norm": 0.9306083917617798, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 3650 + }, + { + "epoch": 4.765625, + "grad_norm": 0.8050256967544556, + "learning_rate": 0.0002, + "loss": 1.3828, + "step": 3660 + }, + { + "epoch": 4.778645833333333, + "grad_norm": 0.8114449381828308, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3670 + }, + { + "epoch": 4.791666666666667, + "grad_norm": 0.8125811815261841, + "learning_rate": 0.0002, + "loss": 1.3296, + "step": 3680 + }, + { + "epoch": 4.8046875, + "grad_norm": 0.7642565369606018, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 3690 + }, + { + "epoch": 4.817708333333333, + "grad_norm": 0.8970131874084473, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3700 + }, + { + "epoch": 4.830729166666667, + "grad_norm": 0.7654327154159546, + "learning_rate": 0.0002, + "loss": 1.3983, + "step": 3710 + }, + { + "epoch": 4.84375, + "grad_norm": 0.7605378031730652, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 3720 + }, + { + "epoch": 4.856770833333333, + "grad_norm": 0.8340551257133484, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 3730 + }, + { + "epoch": 4.869791666666667, + "grad_norm": 0.7273691296577454, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 3740 + }, + { + "epoch": 4.8828125, + "grad_norm": 0.9718272686004639, + "learning_rate": 0.0002, + "loss": 1.3094, + "step": 3750 + }, + { + "epoch": 4.895833333333333, + "grad_norm": 0.7891847491264343, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 3760 + }, + { + "epoch": 4.908854166666667, + "grad_norm": 0.9090818166732788, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 3770 + }, + { + "epoch": 4.921875, + "grad_norm": 0.7963318824768066, + "learning_rate": 0.0002, + "loss": 1.3478, + "step": 3780 + }, + { + "epoch": 4.934895833333333, + "grad_norm": 0.7588343620300293, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 3790 + }, + { + "epoch": 4.947916666666667, + "grad_norm": 0.84076327085495, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3800 + }, + { + "epoch": 4.9609375, + "grad_norm": 0.7767227292060852, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 3810 + }, + { + "epoch": 4.973958333333333, + "grad_norm": 0.8101866245269775, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3820 + }, + { + "epoch": 4.986979166666667, + "grad_norm": 0.7808696627616882, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 3830 + }, + { + "epoch": 5.0, + "grad_norm": 0.9609483480453491, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 3840 + }, + { + "epoch": 5.0, + "eval_loss": 1.9610719680786133, + "eval_runtime": 87.6572, + "eval_samples_per_second": 5.875, + "eval_steps_per_second": 0.742, + "step": 3840 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7770660189175808e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-3840/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ef7211282618d021ded61225e492a6e25e83d67e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40dfd62335a8b3044ea0464119aaf6a19f2a0a8100e71e3788c1ca800e3e4ec7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..247e3839b0505e96b5729e40a54a9bf60d95227c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd17fee26e474621a874d46e5238aabe268a5aa337caac9233003db0f6a0997 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..41986bc78cf5c711012cae19d2316a430c53b978 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc8d1140a299874ea3d75ed2ad0b41bd71b26ced4af0761321483d70948b70a +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..aebce7c8912188cf635a27174b9ae449ceb972d2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5599c42554c7887df8627ffa94e2ffe2aa4715e4c59bbd320d37ef544cb6e736 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eea12a7e1847a314ad24fc5771cfc0f71a14640e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/trainer_state.json @@ -0,0 +1,3301 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4608, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.4801871180534363, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 2310 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.5789998173713684, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 2320 + }, + { + "epoch": 3.0338541666666665, + "grad_norm": 0.49856704473495483, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2330 + }, + { + "epoch": 3.046875, + "grad_norm": 0.5625631213188171, + "learning_rate": 0.0002, + "loss": 1.4718, + "step": 2340 + }, + { + "epoch": 3.0598958333333335, + "grad_norm": 0.557637095451355, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 2350 + }, + { + "epoch": 3.0729166666666665, + "grad_norm": 0.528889536857605, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 2360 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.5952284932136536, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 2370 + }, + { + "epoch": 3.0989583333333335, + "grad_norm": 0.5549899339675903, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 2380 + }, + { + "epoch": 3.1119791666666665, + "grad_norm": 0.662139892578125, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 2390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5281530618667603, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 3.1380208333333335, + "grad_norm": 0.6134106516838074, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 2410 + }, + { + "epoch": 3.1510416666666665, + "grad_norm": 0.6040887236595154, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2420 + }, + { + "epoch": 3.1640625, + "grad_norm": 0.549672544002533, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 2430 + }, + { + "epoch": 3.1770833333333335, + "grad_norm": 0.9195653796195984, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 2440 + }, + { + "epoch": 3.1901041666666665, + "grad_norm": 0.5578703284263611, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 2450 + }, + { + "epoch": 3.203125, + "grad_norm": 0.5982925891876221, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 2460 + }, + { + "epoch": 3.2161458333333335, + "grad_norm": 0.5544393062591553, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 2470 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.6015266180038452, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2480 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.5995243191719055, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2490 + }, + { + "epoch": 3.2552083333333335, + "grad_norm": 0.5846129059791565, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 2500 + }, + { + "epoch": 3.2682291666666665, + "grad_norm": 0.5552570223808289, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 3.28125, + "grad_norm": 0.576998233795166, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 2520 + }, + { + "epoch": 3.2942708333333335, + "grad_norm": 0.6526138186454773, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2530 + }, + { + "epoch": 3.3072916666666665, + "grad_norm": 0.6064265966415405, + "learning_rate": 0.0002, + "loss": 1.474, + "step": 2540 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.5542362928390503, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 2550 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6048482060432434, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 2560 + }, + { + "epoch": 3.3463541666666665, + "grad_norm": 0.6328344941139221, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 2570 + }, + { + "epoch": 3.359375, + "grad_norm": 0.6347311735153198, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2580 + }, + { + "epoch": 3.3723958333333335, + "grad_norm": 0.537570595741272, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2590 + }, + { + "epoch": 3.3854166666666665, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 2600 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.5914373993873596, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2610 + }, + { + "epoch": 3.4114583333333335, + "grad_norm": 0.6724640130996704, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 2620 + }, + { + "epoch": 3.4244791666666665, + "grad_norm": 0.6295472383499146, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2630 + }, + { + "epoch": 3.4375, + "grad_norm": 0.5842770934104919, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 2640 + }, + { + "epoch": 3.4505208333333335, + "grad_norm": 0.6297776699066162, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 2650 + }, + { + "epoch": 3.4635416666666665, + "grad_norm": 0.6105847358703613, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2660 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6294940710067749, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 2670 + }, + { + "epoch": 3.4895833333333335, + "grad_norm": 0.6573333740234375, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2680 + }, + { + "epoch": 3.5026041666666665, + "grad_norm": 0.663661539554596, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 2690 + }, + { + "epoch": 3.515625, + "grad_norm": 0.6729148626327515, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 2700 + }, + { + "epoch": 3.5286458333333335, + "grad_norm": 0.6633102893829346, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 2710 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.567686915397644, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 2720 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.6281962394714355, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 2730 + }, + { + "epoch": 3.5677083333333335, + "grad_norm": 0.5710738897323608, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 2740 + }, + { + "epoch": 3.5807291666666665, + "grad_norm": 0.648162305355072, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 2750 + }, + { + "epoch": 3.59375, + "grad_norm": 0.5466254949569702, + "learning_rate": 0.0002, + "loss": 1.4294, + "step": 2760 + }, + { + "epoch": 3.6067708333333335, + "grad_norm": 0.6867973208427429, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2770 + }, + { + "epoch": 3.6197916666666665, + "grad_norm": 0.673612117767334, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2780 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.6928417086601257, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2790 + }, + { + "epoch": 3.6458333333333335, + "grad_norm": 0.6603742837905884, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2800 + }, + { + "epoch": 3.6588541666666665, + "grad_norm": 0.5964401960372925, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 2810 + }, + { + "epoch": 3.671875, + "grad_norm": 0.6224474310874939, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 2820 + }, + { + "epoch": 3.6848958333333335, + "grad_norm": 0.6592439413070679, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 2830 + }, + { + "epoch": 3.6979166666666665, + "grad_norm": 0.6255369186401367, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 2840 + }, + { + "epoch": 3.7109375, + "grad_norm": 0.7136337757110596, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 2850 + }, + { + "epoch": 3.7239583333333335, + "grad_norm": 0.6229757070541382, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 3.7369791666666665, + "grad_norm": 0.696080207824707, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2870 + }, + { + "epoch": 3.75, + "grad_norm": 0.571873664855957, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2880 + }, + { + "epoch": 3.7630208333333335, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 2890 + }, + { + "epoch": 3.7760416666666665, + "grad_norm": 0.616413950920105, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 2900 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.6267292499542236, + "learning_rate": 0.0002, + "loss": 1.4215, + "step": 2910 + }, + { + "epoch": 3.8020833333333335, + "grad_norm": 0.6630783677101135, + "learning_rate": 0.0002, + "loss": 1.5095, + "step": 2920 + }, + { + "epoch": 3.8151041666666665, + "grad_norm": 0.6004238724708557, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 2930 + }, + { + "epoch": 3.828125, + "grad_norm": 0.6740423440933228, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2940 + }, + { + "epoch": 3.8411458333333335, + "grad_norm": 0.6397785544395447, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 2950 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.6063735485076904, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2960 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.6462053060531616, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 2970 + }, + { + "epoch": 3.8802083333333335, + "grad_norm": 0.7143250107765198, + "learning_rate": 0.0002, + "loss": 1.5237, + "step": 2980 + }, + { + "epoch": 3.8932291666666665, + "grad_norm": 0.6747874617576599, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.622930109500885, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 3000 + }, + { + "epoch": 3.9192708333333335, + "grad_norm": 0.620193600654602, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 3010 + }, + { + "epoch": 3.9322916666666665, + "grad_norm": 0.6321487426757812, + "learning_rate": 0.0002, + "loss": 1.495, + "step": 3020 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 3030 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.6185072660446167, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 3040 + }, + { + "epoch": 3.9713541666666665, + "grad_norm": 0.6005704998970032, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 3050 + }, + { + "epoch": 3.984375, + "grad_norm": 0.5933769941329956, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 3060 + }, + { + "epoch": 3.9973958333333335, + "grad_norm": 0.695209801197052, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 3070 + }, + { + "epoch": 4.0, + "eval_loss": 1.8955267667770386, + "eval_runtime": 103.5061, + "eval_samples_per_second": 4.976, + "eval_steps_per_second": 0.628, + "step": 3072 + }, + { + "epoch": 4.010416666666667, + "grad_norm": 0.6706188321113586, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 3080 + }, + { + "epoch": 4.0234375, + "grad_norm": 0.7263980507850647, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3090 + }, + { + "epoch": 4.036458333333333, + "grad_norm": 0.7767240405082703, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 3100 + }, + { + "epoch": 4.049479166666667, + "grad_norm": 0.6888399124145508, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 3110 + }, + { + "epoch": 4.0625, + "grad_norm": 0.8860331773757935, + "learning_rate": 0.0002, + "loss": 1.2422, + "step": 3120 + }, + { + "epoch": 4.075520833333333, + "grad_norm": 0.7572373151779175, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3130 + }, + { + "epoch": 4.088541666666667, + "grad_norm": 0.8321536183357239, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 3140 + }, + { + "epoch": 4.1015625, + "grad_norm": 0.7042664885520935, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 3150 + }, + { + "epoch": 4.114583333333333, + "grad_norm": 0.8910216689109802, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 3160 + }, + { + "epoch": 4.127604166666667, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3170 + }, + { + "epoch": 4.140625, + "grad_norm": 0.7120883464813232, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 3180 + }, + { + "epoch": 4.153645833333333, + "grad_norm": 0.6904631853103638, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 3190 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.6398878693580627, + "learning_rate": 0.0002, + "loss": 1.2881, + "step": 3200 + }, + { + "epoch": 4.1796875, + "grad_norm": 0.7573692798614502, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 3210 + }, + { + "epoch": 4.192708333333333, + "grad_norm": 0.7850743532180786, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 3220 + }, + { + "epoch": 4.205729166666667, + "grad_norm": 0.7863165736198425, + "learning_rate": 0.0002, + "loss": 1.3176, + "step": 3230 + }, + { + "epoch": 4.21875, + "grad_norm": 0.7855865359306335, + "learning_rate": 0.0002, + "loss": 1.3739, + "step": 3240 + }, + { + "epoch": 4.231770833333333, + "grad_norm": 0.6840922832489014, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 3250 + }, + { + "epoch": 4.244791666666667, + "grad_norm": 0.8499747514724731, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 3260 + }, + { + "epoch": 4.2578125, + "grad_norm": 0.7982883453369141, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 3270 + }, + { + "epoch": 4.270833333333333, + "grad_norm": 0.7776934504508972, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 3280 + }, + { + "epoch": 4.283854166666667, + "grad_norm": 0.8887693881988525, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 3290 + }, + { + "epoch": 4.296875, + "grad_norm": 1.0184714794158936, + "learning_rate": 0.0002, + "loss": 1.3213, + "step": 3300 + }, + { + "epoch": 4.309895833333333, + "grad_norm": 0.7539387345314026, + "learning_rate": 0.0002, + "loss": 1.3212, + "step": 3310 + }, + { + "epoch": 4.322916666666667, + "grad_norm": 0.8137491345405579, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 3320 + }, + { + "epoch": 4.3359375, + "grad_norm": 0.8136276006698608, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 3330 + }, + { + "epoch": 4.348958333333333, + "grad_norm": 0.7880964279174805, + "learning_rate": 0.0002, + "loss": 1.3512, + "step": 3340 + }, + { + "epoch": 4.361979166666667, + "grad_norm": 0.8654456734657288, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 3350 + }, + { + "epoch": 4.375, + "grad_norm": 0.8093366622924805, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3360 + }, + { + "epoch": 4.388020833333333, + "grad_norm": 0.8738575577735901, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 3370 + }, + { + "epoch": 4.401041666666667, + "grad_norm": 0.8923026919364929, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3380 + }, + { + "epoch": 4.4140625, + "grad_norm": 0.8508910536766052, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 3390 + }, + { + "epoch": 4.427083333333333, + "grad_norm": 0.8262084722518921, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 3400 + }, + { + "epoch": 4.440104166666667, + "grad_norm": 0.7843561768531799, + "learning_rate": 0.0002, + "loss": 1.3145, + "step": 3410 + }, + { + "epoch": 4.453125, + "grad_norm": 0.9087795615196228, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3420 + }, + { + "epoch": 4.466145833333333, + "grad_norm": 0.8278809189796448, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3430 + }, + { + "epoch": 4.479166666666667, + "grad_norm": 0.8337010741233826, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 3440 + }, + { + "epoch": 4.4921875, + "grad_norm": 0.7790088057518005, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 3450 + }, + { + "epoch": 4.505208333333333, + "grad_norm": 0.826231837272644, + "learning_rate": 0.0002, + "loss": 1.3344, + "step": 3460 + }, + { + "epoch": 4.518229166666667, + "grad_norm": 0.761461079120636, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 3470 + }, + { + "epoch": 4.53125, + "grad_norm": 0.8892785906791687, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 3480 + }, + { + "epoch": 4.544270833333333, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 3490 + }, + { + "epoch": 4.557291666666667, + "grad_norm": 0.8259274363517761, + "learning_rate": 0.0002, + "loss": 1.3167, + "step": 3500 + }, + { + "epoch": 4.5703125, + "grad_norm": 0.821164071559906, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3510 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.7262887954711914, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 3520 + }, + { + "epoch": 4.596354166666667, + "grad_norm": 0.8564826250076294, + "learning_rate": 0.0002, + "loss": 1.3777, + "step": 3530 + }, + { + "epoch": 4.609375, + "grad_norm": 0.8072929978370667, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3540 + }, + { + "epoch": 4.622395833333333, + "grad_norm": 0.8040832877159119, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 3550 + }, + { + "epoch": 4.635416666666667, + "grad_norm": 0.7268754839897156, + "learning_rate": 0.0002, + "loss": 1.2863, + "step": 3560 + }, + { + "epoch": 4.6484375, + "grad_norm": 0.9985134601593018, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3570 + }, + { + "epoch": 4.661458333333333, + "grad_norm": 0.9826098680496216, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 3580 + }, + { + "epoch": 4.674479166666667, + "grad_norm": 0.8794422149658203, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 3590 + }, + { + "epoch": 4.6875, + "grad_norm": 0.7207489609718323, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3600 + }, + { + "epoch": 4.700520833333333, + "grad_norm": 0.7546059489250183, + "learning_rate": 0.0002, + "loss": 1.3192, + "step": 3610 + }, + { + "epoch": 4.713541666666667, + "grad_norm": 0.8318526148796082, + "learning_rate": 0.0002, + "loss": 1.3445, + "step": 3620 + }, + { + "epoch": 4.7265625, + "grad_norm": 0.7529309391975403, + "learning_rate": 0.0002, + "loss": 1.3847, + "step": 3630 + }, + { + "epoch": 4.739583333333333, + "grad_norm": 0.7762532234191895, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3640 + }, + { + "epoch": 4.752604166666667, + "grad_norm": 0.9306083917617798, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 3650 + }, + { + "epoch": 4.765625, + "grad_norm": 0.8050256967544556, + "learning_rate": 0.0002, + "loss": 1.3828, + "step": 3660 + }, + { + "epoch": 4.778645833333333, + "grad_norm": 0.8114449381828308, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3670 + }, + { + "epoch": 4.791666666666667, + "grad_norm": 0.8125811815261841, + "learning_rate": 0.0002, + "loss": 1.3296, + "step": 3680 + }, + { + "epoch": 4.8046875, + "grad_norm": 0.7642565369606018, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 3690 + }, + { + "epoch": 4.817708333333333, + "grad_norm": 0.8970131874084473, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3700 + }, + { + "epoch": 4.830729166666667, + "grad_norm": 0.7654327154159546, + "learning_rate": 0.0002, + "loss": 1.3983, + "step": 3710 + }, + { + "epoch": 4.84375, + "grad_norm": 0.7605378031730652, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 3720 + }, + { + "epoch": 4.856770833333333, + "grad_norm": 0.8340551257133484, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 3730 + }, + { + "epoch": 4.869791666666667, + "grad_norm": 0.7273691296577454, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 3740 + }, + { + "epoch": 4.8828125, + "grad_norm": 0.9718272686004639, + "learning_rate": 0.0002, + "loss": 1.3094, + "step": 3750 + }, + { + "epoch": 4.895833333333333, + "grad_norm": 0.7891847491264343, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 3760 + }, + { + "epoch": 4.908854166666667, + "grad_norm": 0.9090818166732788, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 3770 + }, + { + "epoch": 4.921875, + "grad_norm": 0.7963318824768066, + "learning_rate": 0.0002, + "loss": 1.3478, + "step": 3780 + }, + { + "epoch": 4.934895833333333, + "grad_norm": 0.7588343620300293, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 3790 + }, + { + "epoch": 4.947916666666667, + "grad_norm": 0.84076327085495, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3800 + }, + { + "epoch": 4.9609375, + "grad_norm": 0.7767227292060852, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 3810 + }, + { + "epoch": 4.973958333333333, + "grad_norm": 0.8101866245269775, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3820 + }, + { + "epoch": 4.986979166666667, + "grad_norm": 0.7808696627616882, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 3830 + }, + { + "epoch": 5.0, + "grad_norm": 0.9609483480453491, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 3840 + }, + { + "epoch": 5.0, + "eval_loss": 1.9610719680786133, + "eval_runtime": 87.6572, + "eval_samples_per_second": 5.875, + "eval_steps_per_second": 0.742, + "step": 3840 + }, + { + "epoch": 5.013020833333333, + "grad_norm": 0.9366803765296936, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 3850 + }, + { + "epoch": 5.026041666666667, + "grad_norm": 0.8014302849769592, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 3860 + }, + { + "epoch": 5.0390625, + "grad_norm": 0.977936863899231, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 3870 + }, + { + "epoch": 5.052083333333333, + "grad_norm": 1.045047640800476, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 3880 + }, + { + "epoch": 5.065104166666667, + "grad_norm": 1.125620722770691, + "learning_rate": 0.0002, + "loss": 1.1709, + "step": 3890 + }, + { + "epoch": 5.078125, + "grad_norm": 1.1565124988555908, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 3900 + }, + { + "epoch": 5.091145833333333, + "grad_norm": 1.102354884147644, + "learning_rate": 0.0002, + "loss": 1.1753, + "step": 3910 + }, + { + "epoch": 5.104166666666667, + "grad_norm": 0.9567629098892212, + "learning_rate": 0.0002, + "loss": 1.1632, + "step": 3920 + }, + { + "epoch": 5.1171875, + "grad_norm": 0.9760252833366394, + "learning_rate": 0.0002, + "loss": 1.1875, + "step": 3930 + }, + { + "epoch": 5.130208333333333, + "grad_norm": 1.026168704032898, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 3940 + }, + { + "epoch": 5.143229166666667, + "grad_norm": 1.1490436792373657, + "learning_rate": 0.0002, + "loss": 1.1598, + "step": 3950 + }, + { + "epoch": 5.15625, + "grad_norm": 0.9712087512016296, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 3960 + }, + { + "epoch": 5.169270833333333, + "grad_norm": 1.0095003843307495, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 3970 + }, + { + "epoch": 5.182291666666667, + "grad_norm": 0.9171855449676514, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 3980 + }, + { + "epoch": 5.1953125, + "grad_norm": 1.0105657577514648, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 3990 + }, + { + "epoch": 5.208333333333333, + "grad_norm": 1.0330145359039307, + "learning_rate": 0.0002, + "loss": 1.2098, + "step": 4000 + }, + { + "epoch": 5.221354166666667, + "grad_norm": 1.0676906108856201, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4010 + }, + { + "epoch": 5.234375, + "grad_norm": 1.055088758468628, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 4020 + }, + { + "epoch": 5.247395833333333, + "grad_norm": 0.9523683786392212, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 4030 + }, + { + "epoch": 5.260416666666667, + "grad_norm": 0.9013799428939819, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 4040 + }, + { + "epoch": 5.2734375, + "grad_norm": 0.9379037618637085, + "learning_rate": 0.0002, + "loss": 1.2274, + "step": 4050 + }, + { + "epoch": 5.286458333333333, + "grad_norm": 0.9565327763557434, + "learning_rate": 0.0002, + "loss": 1.1246, + "step": 4060 + }, + { + "epoch": 5.299479166666667, + "grad_norm": 1.1994404792785645, + "learning_rate": 0.0002, + "loss": 1.2103, + "step": 4070 + }, + { + "epoch": 5.3125, + "grad_norm": 1.0563262701034546, + "learning_rate": 0.0002, + "loss": 1.2016, + "step": 4080 + }, + { + "epoch": 5.325520833333333, + "grad_norm": 1.024290680885315, + "learning_rate": 0.0002, + "loss": 1.2478, + "step": 4090 + }, + { + "epoch": 5.338541666666667, + "grad_norm": 1.0022907257080078, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 4100 + }, + { + "epoch": 5.3515625, + "grad_norm": 0.9642180800437927, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 4110 + }, + { + "epoch": 5.364583333333333, + "grad_norm": 1.0228009223937988, + "learning_rate": 0.0002, + "loss": 1.231, + "step": 4120 + }, + { + "epoch": 5.377604166666667, + "grad_norm": 1.0379719734191895, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4130 + }, + { + "epoch": 5.390625, + "grad_norm": 1.147053599357605, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4140 + }, + { + "epoch": 5.403645833333333, + "grad_norm": 1.2097876071929932, + "learning_rate": 0.0002, + "loss": 1.2026, + "step": 4150 + }, + { + "epoch": 5.416666666666667, + "grad_norm": 1.0852497816085815, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 4160 + }, + { + "epoch": 5.4296875, + "grad_norm": 0.9765135645866394, + "learning_rate": 0.0002, + "loss": 1.2182, + "step": 4170 + }, + { + "epoch": 5.442708333333333, + "grad_norm": 1.0180606842041016, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 4180 + }, + { + "epoch": 5.455729166666667, + "grad_norm": 1.185409665107727, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4190 + }, + { + "epoch": 5.46875, + "grad_norm": 0.9363358020782471, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 4200 + }, + { + "epoch": 5.481770833333333, + "grad_norm": 1.0761215686798096, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 4210 + }, + { + "epoch": 5.494791666666667, + "grad_norm": 1.057626724243164, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 4220 + }, + { + "epoch": 5.5078125, + "grad_norm": 1.0103157758712769, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4230 + }, + { + "epoch": 5.520833333333333, + "grad_norm": 1.1056627035140991, + "learning_rate": 0.0002, + "loss": 1.2193, + "step": 4240 + }, + { + "epoch": 5.533854166666667, + "grad_norm": 1.0256257057189941, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 4250 + }, + { + "epoch": 5.546875, + "grad_norm": 1.2814106941223145, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4260 + }, + { + "epoch": 5.559895833333333, + "grad_norm": 0.9044927954673767, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 4270 + }, + { + "epoch": 5.572916666666667, + "grad_norm": 0.9870165586471558, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4280 + }, + { + "epoch": 5.5859375, + "grad_norm": 0.9867369532585144, + "learning_rate": 0.0002, + "loss": 1.2414, + "step": 4290 + }, + { + "epoch": 5.598958333333333, + "grad_norm": 1.045625925064087, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4300 + }, + { + "epoch": 5.611979166666667, + "grad_norm": 0.979853630065918, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 4310 + }, + { + "epoch": 5.625, + "grad_norm": 1.029212236404419, + "learning_rate": 0.0002, + "loss": 1.1629, + "step": 4320 + }, + { + "epoch": 5.638020833333333, + "grad_norm": 1.0348633527755737, + "learning_rate": 0.0002, + "loss": 1.1985, + "step": 4330 + }, + { + "epoch": 5.651041666666667, + "grad_norm": 1.0055185556411743, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 4340 + }, + { + "epoch": 5.6640625, + "grad_norm": 0.9312447309494019, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 4350 + }, + { + "epoch": 5.677083333333333, + "grad_norm": 1.1411694288253784, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 4360 + }, + { + "epoch": 5.690104166666667, + "grad_norm": 0.9764434695243835, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 4370 + }, + { + "epoch": 5.703125, + "grad_norm": 1.079154133796692, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 4380 + }, + { + "epoch": 5.716145833333333, + "grad_norm": 0.999526858329773, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4390 + }, + { + "epoch": 5.729166666666667, + "grad_norm": 1.1239734888076782, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 4400 + }, + { + "epoch": 5.7421875, + "grad_norm": 1.0539512634277344, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 4410 + }, + { + "epoch": 5.755208333333333, + "grad_norm": 0.9884052872657776, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4420 + }, + { + "epoch": 5.768229166666667, + "grad_norm": 0.9821958541870117, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 4430 + }, + { + "epoch": 5.78125, + "grad_norm": 0.9340839982032776, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4440 + }, + { + "epoch": 5.794270833333333, + "grad_norm": 0.9935781955718994, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 4450 + }, + { + "epoch": 5.807291666666667, + "grad_norm": 1.1027121543884277, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 4460 + }, + { + "epoch": 5.8203125, + "grad_norm": 0.9388337135314941, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 4470 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 1.0957310199737549, + "learning_rate": 0.0002, + "loss": 1.259, + "step": 4480 + }, + { + "epoch": 5.846354166666667, + "grad_norm": 1.0832754373550415, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 4490 + }, + { + "epoch": 5.859375, + "grad_norm": 0.9498379826545715, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4500 + }, + { + "epoch": 5.872395833333333, + "grad_norm": 0.9104725122451782, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 4510 + }, + { + "epoch": 5.885416666666667, + "grad_norm": 1.2238177061080933, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 4520 + }, + { + "epoch": 5.8984375, + "grad_norm": 1.0549527406692505, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 4530 + }, + { + "epoch": 5.911458333333333, + "grad_norm": 1.0415066480636597, + "learning_rate": 0.0002, + "loss": 1.3086, + "step": 4540 + }, + { + "epoch": 5.924479166666667, + "grad_norm": 0.9098646640777588, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 4550 + }, + { + "epoch": 5.9375, + "grad_norm": 0.9182857275009155, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 4560 + }, + { + "epoch": 5.950520833333333, + "grad_norm": 1.088038444519043, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4570 + }, + { + "epoch": 5.963541666666667, + "grad_norm": 1.1331020593643188, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 4580 + }, + { + "epoch": 5.9765625, + "grad_norm": 0.9592235088348389, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4590 + }, + { + "epoch": 5.989583333333333, + "grad_norm": 1.0126368999481201, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4600 + }, + { + "epoch": 6.0, + "eval_loss": 2.096651315689087, + "eval_runtime": 43.1936, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 1.505, + "step": 4608 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.132479222701097e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-4608/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..608e97b18137fc55c344219caa02693218b3d406 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59d719ca30ad7a90153130a27a26874ab470e301469c9234322b99c5a32388e +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fe9b7ef6274ec85055b01f926ab7d1c0ee1a37d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:108190c0a47e46d317b0ab0838e861d2f62b588b360a0f7030b80d3cfe374a4f +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..434694143a8317dfc982a28379351811c5656391 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f75b257cb06979b89ce8e974ee3b3e5fabd6e576d43fb7da6e4a1a99bcaba76 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3cf7c2488a1d0e435fb77fe6f4a534194629649 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fac21404f46a4dc8116dc1d6f353dc19ebb8014de829697aa5453e170307f5 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ce80ec1dec8da50f01b44c83d7c7cf93c6da6f57 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/trainer_state.json @@ -0,0 +1,3848 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 5376, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.4801871180534363, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 2310 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.5789998173713684, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 2320 + }, + { + "epoch": 3.0338541666666665, + "grad_norm": 0.49856704473495483, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2330 + }, + { + "epoch": 3.046875, + "grad_norm": 0.5625631213188171, + "learning_rate": 0.0002, + "loss": 1.4718, + "step": 2340 + }, + { + "epoch": 3.0598958333333335, + "grad_norm": 0.557637095451355, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 2350 + }, + { + "epoch": 3.0729166666666665, + "grad_norm": 0.528889536857605, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 2360 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.5952284932136536, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 2370 + }, + { + "epoch": 3.0989583333333335, + "grad_norm": 0.5549899339675903, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 2380 + }, + { + "epoch": 3.1119791666666665, + "grad_norm": 0.662139892578125, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 2390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5281530618667603, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 3.1380208333333335, + "grad_norm": 0.6134106516838074, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 2410 + }, + { + "epoch": 3.1510416666666665, + "grad_norm": 0.6040887236595154, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2420 + }, + { + "epoch": 3.1640625, + "grad_norm": 0.549672544002533, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 2430 + }, + { + "epoch": 3.1770833333333335, + "grad_norm": 0.9195653796195984, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 2440 + }, + { + "epoch": 3.1901041666666665, + "grad_norm": 0.5578703284263611, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 2450 + }, + { + "epoch": 3.203125, + "grad_norm": 0.5982925891876221, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 2460 + }, + { + "epoch": 3.2161458333333335, + "grad_norm": 0.5544393062591553, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 2470 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.6015266180038452, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2480 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.5995243191719055, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2490 + }, + { + "epoch": 3.2552083333333335, + "grad_norm": 0.5846129059791565, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 2500 + }, + { + "epoch": 3.2682291666666665, + "grad_norm": 0.5552570223808289, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 3.28125, + "grad_norm": 0.576998233795166, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 2520 + }, + { + "epoch": 3.2942708333333335, + "grad_norm": 0.6526138186454773, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2530 + }, + { + "epoch": 3.3072916666666665, + "grad_norm": 0.6064265966415405, + "learning_rate": 0.0002, + "loss": 1.474, + "step": 2540 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.5542362928390503, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 2550 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6048482060432434, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 2560 + }, + { + "epoch": 3.3463541666666665, + "grad_norm": 0.6328344941139221, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 2570 + }, + { + "epoch": 3.359375, + "grad_norm": 0.6347311735153198, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2580 + }, + { + "epoch": 3.3723958333333335, + "grad_norm": 0.537570595741272, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2590 + }, + { + "epoch": 3.3854166666666665, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 2600 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.5914373993873596, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2610 + }, + { + "epoch": 3.4114583333333335, + "grad_norm": 0.6724640130996704, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 2620 + }, + { + "epoch": 3.4244791666666665, + "grad_norm": 0.6295472383499146, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2630 + }, + { + "epoch": 3.4375, + "grad_norm": 0.5842770934104919, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 2640 + }, + { + "epoch": 3.4505208333333335, + "grad_norm": 0.6297776699066162, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 2650 + }, + { + "epoch": 3.4635416666666665, + "grad_norm": 0.6105847358703613, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2660 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6294940710067749, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 2670 + }, + { + "epoch": 3.4895833333333335, + "grad_norm": 0.6573333740234375, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2680 + }, + { + "epoch": 3.5026041666666665, + "grad_norm": 0.663661539554596, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 2690 + }, + { + "epoch": 3.515625, + "grad_norm": 0.6729148626327515, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 2700 + }, + { + "epoch": 3.5286458333333335, + "grad_norm": 0.6633102893829346, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 2710 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.567686915397644, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 2720 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.6281962394714355, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 2730 + }, + { + "epoch": 3.5677083333333335, + "grad_norm": 0.5710738897323608, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 2740 + }, + { + "epoch": 3.5807291666666665, + "grad_norm": 0.648162305355072, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 2750 + }, + { + "epoch": 3.59375, + "grad_norm": 0.5466254949569702, + "learning_rate": 0.0002, + "loss": 1.4294, + "step": 2760 + }, + { + "epoch": 3.6067708333333335, + "grad_norm": 0.6867973208427429, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2770 + }, + { + "epoch": 3.6197916666666665, + "grad_norm": 0.673612117767334, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2780 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.6928417086601257, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2790 + }, + { + "epoch": 3.6458333333333335, + "grad_norm": 0.6603742837905884, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2800 + }, + { + "epoch": 3.6588541666666665, + "grad_norm": 0.5964401960372925, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 2810 + }, + { + "epoch": 3.671875, + "grad_norm": 0.6224474310874939, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 2820 + }, + { + "epoch": 3.6848958333333335, + "grad_norm": 0.6592439413070679, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 2830 + }, + { + "epoch": 3.6979166666666665, + "grad_norm": 0.6255369186401367, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 2840 + }, + { + "epoch": 3.7109375, + "grad_norm": 0.7136337757110596, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 2850 + }, + { + "epoch": 3.7239583333333335, + "grad_norm": 0.6229757070541382, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 3.7369791666666665, + "grad_norm": 0.696080207824707, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2870 + }, + { + "epoch": 3.75, + "grad_norm": 0.571873664855957, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2880 + }, + { + "epoch": 3.7630208333333335, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 2890 + }, + { + "epoch": 3.7760416666666665, + "grad_norm": 0.616413950920105, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 2900 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.6267292499542236, + "learning_rate": 0.0002, + "loss": 1.4215, + "step": 2910 + }, + { + "epoch": 3.8020833333333335, + "grad_norm": 0.6630783677101135, + "learning_rate": 0.0002, + "loss": 1.5095, + "step": 2920 + }, + { + "epoch": 3.8151041666666665, + "grad_norm": 0.6004238724708557, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 2930 + }, + { + "epoch": 3.828125, + "grad_norm": 0.6740423440933228, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2940 + }, + { + "epoch": 3.8411458333333335, + "grad_norm": 0.6397785544395447, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 2950 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.6063735485076904, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2960 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.6462053060531616, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 2970 + }, + { + "epoch": 3.8802083333333335, + "grad_norm": 0.7143250107765198, + "learning_rate": 0.0002, + "loss": 1.5237, + "step": 2980 + }, + { + "epoch": 3.8932291666666665, + "grad_norm": 0.6747874617576599, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.622930109500885, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 3000 + }, + { + "epoch": 3.9192708333333335, + "grad_norm": 0.620193600654602, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 3010 + }, + { + "epoch": 3.9322916666666665, + "grad_norm": 0.6321487426757812, + "learning_rate": 0.0002, + "loss": 1.495, + "step": 3020 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 3030 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.6185072660446167, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 3040 + }, + { + "epoch": 3.9713541666666665, + "grad_norm": 0.6005704998970032, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 3050 + }, + { + "epoch": 3.984375, + "grad_norm": 0.5933769941329956, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 3060 + }, + { + "epoch": 3.9973958333333335, + "grad_norm": 0.695209801197052, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 3070 + }, + { + "epoch": 4.0, + "eval_loss": 1.8955267667770386, + "eval_runtime": 103.5061, + "eval_samples_per_second": 4.976, + "eval_steps_per_second": 0.628, + "step": 3072 + }, + { + "epoch": 4.010416666666667, + "grad_norm": 0.6706188321113586, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 3080 + }, + { + "epoch": 4.0234375, + "grad_norm": 0.7263980507850647, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3090 + }, + { + "epoch": 4.036458333333333, + "grad_norm": 0.7767240405082703, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 3100 + }, + { + "epoch": 4.049479166666667, + "grad_norm": 0.6888399124145508, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 3110 + }, + { + "epoch": 4.0625, + "grad_norm": 0.8860331773757935, + "learning_rate": 0.0002, + "loss": 1.2422, + "step": 3120 + }, + { + "epoch": 4.075520833333333, + "grad_norm": 0.7572373151779175, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3130 + }, + { + "epoch": 4.088541666666667, + "grad_norm": 0.8321536183357239, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 3140 + }, + { + "epoch": 4.1015625, + "grad_norm": 0.7042664885520935, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 3150 + }, + { + "epoch": 4.114583333333333, + "grad_norm": 0.8910216689109802, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 3160 + }, + { + "epoch": 4.127604166666667, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3170 + }, + { + "epoch": 4.140625, + "grad_norm": 0.7120883464813232, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 3180 + }, + { + "epoch": 4.153645833333333, + "grad_norm": 0.6904631853103638, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 3190 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.6398878693580627, + "learning_rate": 0.0002, + "loss": 1.2881, + "step": 3200 + }, + { + "epoch": 4.1796875, + "grad_norm": 0.7573692798614502, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 3210 + }, + { + "epoch": 4.192708333333333, + "grad_norm": 0.7850743532180786, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 3220 + }, + { + "epoch": 4.205729166666667, + "grad_norm": 0.7863165736198425, + "learning_rate": 0.0002, + "loss": 1.3176, + "step": 3230 + }, + { + "epoch": 4.21875, + "grad_norm": 0.7855865359306335, + "learning_rate": 0.0002, + "loss": 1.3739, + "step": 3240 + }, + { + "epoch": 4.231770833333333, + "grad_norm": 0.6840922832489014, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 3250 + }, + { + "epoch": 4.244791666666667, + "grad_norm": 0.8499747514724731, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 3260 + }, + { + "epoch": 4.2578125, + "grad_norm": 0.7982883453369141, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 3270 + }, + { + "epoch": 4.270833333333333, + "grad_norm": 0.7776934504508972, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 3280 + }, + { + "epoch": 4.283854166666667, + "grad_norm": 0.8887693881988525, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 3290 + }, + { + "epoch": 4.296875, + "grad_norm": 1.0184714794158936, + "learning_rate": 0.0002, + "loss": 1.3213, + "step": 3300 + }, + { + "epoch": 4.309895833333333, + "grad_norm": 0.7539387345314026, + "learning_rate": 0.0002, + "loss": 1.3212, + "step": 3310 + }, + { + "epoch": 4.322916666666667, + "grad_norm": 0.8137491345405579, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 3320 + }, + { + "epoch": 4.3359375, + "grad_norm": 0.8136276006698608, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 3330 + }, + { + "epoch": 4.348958333333333, + "grad_norm": 0.7880964279174805, + "learning_rate": 0.0002, + "loss": 1.3512, + "step": 3340 + }, + { + "epoch": 4.361979166666667, + "grad_norm": 0.8654456734657288, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 3350 + }, + { + "epoch": 4.375, + "grad_norm": 0.8093366622924805, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3360 + }, + { + "epoch": 4.388020833333333, + "grad_norm": 0.8738575577735901, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 3370 + }, + { + "epoch": 4.401041666666667, + "grad_norm": 0.8923026919364929, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3380 + }, + { + "epoch": 4.4140625, + "grad_norm": 0.8508910536766052, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 3390 + }, + { + "epoch": 4.427083333333333, + "grad_norm": 0.8262084722518921, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 3400 + }, + { + "epoch": 4.440104166666667, + "grad_norm": 0.7843561768531799, + "learning_rate": 0.0002, + "loss": 1.3145, + "step": 3410 + }, + { + "epoch": 4.453125, + "grad_norm": 0.9087795615196228, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3420 + }, + { + "epoch": 4.466145833333333, + "grad_norm": 0.8278809189796448, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3430 + }, + { + "epoch": 4.479166666666667, + "grad_norm": 0.8337010741233826, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 3440 + }, + { + "epoch": 4.4921875, + "grad_norm": 0.7790088057518005, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 3450 + }, + { + "epoch": 4.505208333333333, + "grad_norm": 0.826231837272644, + "learning_rate": 0.0002, + "loss": 1.3344, + "step": 3460 + }, + { + "epoch": 4.518229166666667, + "grad_norm": 0.761461079120636, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 3470 + }, + { + "epoch": 4.53125, + "grad_norm": 0.8892785906791687, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 3480 + }, + { + "epoch": 4.544270833333333, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 3490 + }, + { + "epoch": 4.557291666666667, + "grad_norm": 0.8259274363517761, + "learning_rate": 0.0002, + "loss": 1.3167, + "step": 3500 + }, + { + "epoch": 4.5703125, + "grad_norm": 0.821164071559906, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3510 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.7262887954711914, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 3520 + }, + { + "epoch": 4.596354166666667, + "grad_norm": 0.8564826250076294, + "learning_rate": 0.0002, + "loss": 1.3777, + "step": 3530 + }, + { + "epoch": 4.609375, + "grad_norm": 0.8072929978370667, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3540 + }, + { + "epoch": 4.622395833333333, + "grad_norm": 0.8040832877159119, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 3550 + }, + { + "epoch": 4.635416666666667, + "grad_norm": 0.7268754839897156, + "learning_rate": 0.0002, + "loss": 1.2863, + "step": 3560 + }, + { + "epoch": 4.6484375, + "grad_norm": 0.9985134601593018, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3570 + }, + { + "epoch": 4.661458333333333, + "grad_norm": 0.9826098680496216, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 3580 + }, + { + "epoch": 4.674479166666667, + "grad_norm": 0.8794422149658203, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 3590 + }, + { + "epoch": 4.6875, + "grad_norm": 0.7207489609718323, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3600 + }, + { + "epoch": 4.700520833333333, + "grad_norm": 0.7546059489250183, + "learning_rate": 0.0002, + "loss": 1.3192, + "step": 3610 + }, + { + "epoch": 4.713541666666667, + "grad_norm": 0.8318526148796082, + "learning_rate": 0.0002, + "loss": 1.3445, + "step": 3620 + }, + { + "epoch": 4.7265625, + "grad_norm": 0.7529309391975403, + "learning_rate": 0.0002, + "loss": 1.3847, + "step": 3630 + }, + { + "epoch": 4.739583333333333, + "grad_norm": 0.7762532234191895, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3640 + }, + { + "epoch": 4.752604166666667, + "grad_norm": 0.9306083917617798, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 3650 + }, + { + "epoch": 4.765625, + "grad_norm": 0.8050256967544556, + "learning_rate": 0.0002, + "loss": 1.3828, + "step": 3660 + }, + { + "epoch": 4.778645833333333, + "grad_norm": 0.8114449381828308, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3670 + }, + { + "epoch": 4.791666666666667, + "grad_norm": 0.8125811815261841, + "learning_rate": 0.0002, + "loss": 1.3296, + "step": 3680 + }, + { + "epoch": 4.8046875, + "grad_norm": 0.7642565369606018, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 3690 + }, + { + "epoch": 4.817708333333333, + "grad_norm": 0.8970131874084473, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3700 + }, + { + "epoch": 4.830729166666667, + "grad_norm": 0.7654327154159546, + "learning_rate": 0.0002, + "loss": 1.3983, + "step": 3710 + }, + { + "epoch": 4.84375, + "grad_norm": 0.7605378031730652, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 3720 + }, + { + "epoch": 4.856770833333333, + "grad_norm": 0.8340551257133484, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 3730 + }, + { + "epoch": 4.869791666666667, + "grad_norm": 0.7273691296577454, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 3740 + }, + { + "epoch": 4.8828125, + "grad_norm": 0.9718272686004639, + "learning_rate": 0.0002, + "loss": 1.3094, + "step": 3750 + }, + { + "epoch": 4.895833333333333, + "grad_norm": 0.7891847491264343, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 3760 + }, + { + "epoch": 4.908854166666667, + "grad_norm": 0.9090818166732788, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 3770 + }, + { + "epoch": 4.921875, + "grad_norm": 0.7963318824768066, + "learning_rate": 0.0002, + "loss": 1.3478, + "step": 3780 + }, + { + "epoch": 4.934895833333333, + "grad_norm": 0.7588343620300293, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 3790 + }, + { + "epoch": 4.947916666666667, + "grad_norm": 0.84076327085495, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3800 + }, + { + "epoch": 4.9609375, + "grad_norm": 0.7767227292060852, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 3810 + }, + { + "epoch": 4.973958333333333, + "grad_norm": 0.8101866245269775, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3820 + }, + { + "epoch": 4.986979166666667, + "grad_norm": 0.7808696627616882, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 3830 + }, + { + "epoch": 5.0, + "grad_norm": 0.9609483480453491, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 3840 + }, + { + "epoch": 5.0, + "eval_loss": 1.9610719680786133, + "eval_runtime": 87.6572, + "eval_samples_per_second": 5.875, + "eval_steps_per_second": 0.742, + "step": 3840 + }, + { + "epoch": 5.013020833333333, + "grad_norm": 0.9366803765296936, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 3850 + }, + { + "epoch": 5.026041666666667, + "grad_norm": 0.8014302849769592, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 3860 + }, + { + "epoch": 5.0390625, + "grad_norm": 0.977936863899231, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 3870 + }, + { + "epoch": 5.052083333333333, + "grad_norm": 1.045047640800476, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 3880 + }, + { + "epoch": 5.065104166666667, + "grad_norm": 1.125620722770691, + "learning_rate": 0.0002, + "loss": 1.1709, + "step": 3890 + }, + { + "epoch": 5.078125, + "grad_norm": 1.1565124988555908, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 3900 + }, + { + "epoch": 5.091145833333333, + "grad_norm": 1.102354884147644, + "learning_rate": 0.0002, + "loss": 1.1753, + "step": 3910 + }, + { + "epoch": 5.104166666666667, + "grad_norm": 0.9567629098892212, + "learning_rate": 0.0002, + "loss": 1.1632, + "step": 3920 + }, + { + "epoch": 5.1171875, + "grad_norm": 0.9760252833366394, + "learning_rate": 0.0002, + "loss": 1.1875, + "step": 3930 + }, + { + "epoch": 5.130208333333333, + "grad_norm": 1.026168704032898, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 3940 + }, + { + "epoch": 5.143229166666667, + "grad_norm": 1.1490436792373657, + "learning_rate": 0.0002, + "loss": 1.1598, + "step": 3950 + }, + { + "epoch": 5.15625, + "grad_norm": 0.9712087512016296, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 3960 + }, + { + "epoch": 5.169270833333333, + "grad_norm": 1.0095003843307495, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 3970 + }, + { + "epoch": 5.182291666666667, + "grad_norm": 0.9171855449676514, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 3980 + }, + { + "epoch": 5.1953125, + "grad_norm": 1.0105657577514648, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 3990 + }, + { + "epoch": 5.208333333333333, + "grad_norm": 1.0330145359039307, + "learning_rate": 0.0002, + "loss": 1.2098, + "step": 4000 + }, + { + "epoch": 5.221354166666667, + "grad_norm": 1.0676906108856201, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4010 + }, + { + "epoch": 5.234375, + "grad_norm": 1.055088758468628, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 4020 + }, + { + "epoch": 5.247395833333333, + "grad_norm": 0.9523683786392212, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 4030 + }, + { + "epoch": 5.260416666666667, + "grad_norm": 0.9013799428939819, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 4040 + }, + { + "epoch": 5.2734375, + "grad_norm": 0.9379037618637085, + "learning_rate": 0.0002, + "loss": 1.2274, + "step": 4050 + }, + { + "epoch": 5.286458333333333, + "grad_norm": 0.9565327763557434, + "learning_rate": 0.0002, + "loss": 1.1246, + "step": 4060 + }, + { + "epoch": 5.299479166666667, + "grad_norm": 1.1994404792785645, + "learning_rate": 0.0002, + "loss": 1.2103, + "step": 4070 + }, + { + "epoch": 5.3125, + "grad_norm": 1.0563262701034546, + "learning_rate": 0.0002, + "loss": 1.2016, + "step": 4080 + }, + { + "epoch": 5.325520833333333, + "grad_norm": 1.024290680885315, + "learning_rate": 0.0002, + "loss": 1.2478, + "step": 4090 + }, + { + "epoch": 5.338541666666667, + "grad_norm": 1.0022907257080078, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 4100 + }, + { + "epoch": 5.3515625, + "grad_norm": 0.9642180800437927, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 4110 + }, + { + "epoch": 5.364583333333333, + "grad_norm": 1.0228009223937988, + "learning_rate": 0.0002, + "loss": 1.231, + "step": 4120 + }, + { + "epoch": 5.377604166666667, + "grad_norm": 1.0379719734191895, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4130 + }, + { + "epoch": 5.390625, + "grad_norm": 1.147053599357605, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4140 + }, + { + "epoch": 5.403645833333333, + "grad_norm": 1.2097876071929932, + "learning_rate": 0.0002, + "loss": 1.2026, + "step": 4150 + }, + { + "epoch": 5.416666666666667, + "grad_norm": 1.0852497816085815, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 4160 + }, + { + "epoch": 5.4296875, + "grad_norm": 0.9765135645866394, + "learning_rate": 0.0002, + "loss": 1.2182, + "step": 4170 + }, + { + "epoch": 5.442708333333333, + "grad_norm": 1.0180606842041016, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 4180 + }, + { + "epoch": 5.455729166666667, + "grad_norm": 1.185409665107727, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4190 + }, + { + "epoch": 5.46875, + "grad_norm": 0.9363358020782471, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 4200 + }, + { + "epoch": 5.481770833333333, + "grad_norm": 1.0761215686798096, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 4210 + }, + { + "epoch": 5.494791666666667, + "grad_norm": 1.057626724243164, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 4220 + }, + { + "epoch": 5.5078125, + "grad_norm": 1.0103157758712769, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4230 + }, + { + "epoch": 5.520833333333333, + "grad_norm": 1.1056627035140991, + "learning_rate": 0.0002, + "loss": 1.2193, + "step": 4240 + }, + { + "epoch": 5.533854166666667, + "grad_norm": 1.0256257057189941, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 4250 + }, + { + "epoch": 5.546875, + "grad_norm": 1.2814106941223145, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4260 + }, + { + "epoch": 5.559895833333333, + "grad_norm": 0.9044927954673767, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 4270 + }, + { + "epoch": 5.572916666666667, + "grad_norm": 0.9870165586471558, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4280 + }, + { + "epoch": 5.5859375, + "grad_norm": 0.9867369532585144, + "learning_rate": 0.0002, + "loss": 1.2414, + "step": 4290 + }, + { + "epoch": 5.598958333333333, + "grad_norm": 1.045625925064087, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4300 + }, + { + "epoch": 5.611979166666667, + "grad_norm": 0.979853630065918, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 4310 + }, + { + "epoch": 5.625, + "grad_norm": 1.029212236404419, + "learning_rate": 0.0002, + "loss": 1.1629, + "step": 4320 + }, + { + "epoch": 5.638020833333333, + "grad_norm": 1.0348633527755737, + "learning_rate": 0.0002, + "loss": 1.1985, + "step": 4330 + }, + { + "epoch": 5.651041666666667, + "grad_norm": 1.0055185556411743, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 4340 + }, + { + "epoch": 5.6640625, + "grad_norm": 0.9312447309494019, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 4350 + }, + { + "epoch": 5.677083333333333, + "grad_norm": 1.1411694288253784, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 4360 + }, + { + "epoch": 5.690104166666667, + "grad_norm": 0.9764434695243835, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 4370 + }, + { + "epoch": 5.703125, + "grad_norm": 1.079154133796692, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 4380 + }, + { + "epoch": 5.716145833333333, + "grad_norm": 0.999526858329773, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4390 + }, + { + "epoch": 5.729166666666667, + "grad_norm": 1.1239734888076782, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 4400 + }, + { + "epoch": 5.7421875, + "grad_norm": 1.0539512634277344, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 4410 + }, + { + "epoch": 5.755208333333333, + "grad_norm": 0.9884052872657776, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4420 + }, + { + "epoch": 5.768229166666667, + "grad_norm": 0.9821958541870117, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 4430 + }, + { + "epoch": 5.78125, + "grad_norm": 0.9340839982032776, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4440 + }, + { + "epoch": 5.794270833333333, + "grad_norm": 0.9935781955718994, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 4450 + }, + { + "epoch": 5.807291666666667, + "grad_norm": 1.1027121543884277, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 4460 + }, + { + "epoch": 5.8203125, + "grad_norm": 0.9388337135314941, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 4470 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 1.0957310199737549, + "learning_rate": 0.0002, + "loss": 1.259, + "step": 4480 + }, + { + "epoch": 5.846354166666667, + "grad_norm": 1.0832754373550415, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 4490 + }, + { + "epoch": 5.859375, + "grad_norm": 0.9498379826545715, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4500 + }, + { + "epoch": 5.872395833333333, + "grad_norm": 0.9104725122451782, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 4510 + }, + { + "epoch": 5.885416666666667, + "grad_norm": 1.2238177061080933, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 4520 + }, + { + "epoch": 5.8984375, + "grad_norm": 1.0549527406692505, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 4530 + }, + { + "epoch": 5.911458333333333, + "grad_norm": 1.0415066480636597, + "learning_rate": 0.0002, + "loss": 1.3086, + "step": 4540 + }, + { + "epoch": 5.924479166666667, + "grad_norm": 0.9098646640777588, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 4550 + }, + { + "epoch": 5.9375, + "grad_norm": 0.9182857275009155, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 4560 + }, + { + "epoch": 5.950520833333333, + "grad_norm": 1.088038444519043, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4570 + }, + { + "epoch": 5.963541666666667, + "grad_norm": 1.1331020593643188, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 4580 + }, + { + "epoch": 5.9765625, + "grad_norm": 0.9592235088348389, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4590 + }, + { + "epoch": 5.989583333333333, + "grad_norm": 1.0126368999481201, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4600 + }, + { + "epoch": 6.0, + "eval_loss": 2.096651315689087, + "eval_runtime": 43.1936, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 1.505, + "step": 4608 + }, + { + "epoch": 6.002604166666667, + "grad_norm": 1.0549334287643433, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 4610 + }, + { + "epoch": 6.015625, + "grad_norm": 1.099247694015503, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 4620 + }, + { + "epoch": 6.028645833333333, + "grad_norm": 1.0992592573165894, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 4630 + }, + { + "epoch": 6.041666666666667, + "grad_norm": 1.139350414276123, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 4640 + }, + { + "epoch": 6.0546875, + "grad_norm": 1.1316219568252563, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 4650 + }, + { + "epoch": 6.067708333333333, + "grad_norm": 1.5254799127578735, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 4660 + }, + { + "epoch": 6.080729166666667, + "grad_norm": 1.155513048171997, + "learning_rate": 0.0002, + "loss": 1.0357, + "step": 4670 + }, + { + "epoch": 6.09375, + "grad_norm": 1.311339259147644, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 4680 + }, + { + "epoch": 6.106770833333333, + "grad_norm": 0.9942600131034851, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 4690 + }, + { + "epoch": 6.119791666666667, + "grad_norm": 1.388214111328125, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 4700 + }, + { + "epoch": 6.1328125, + "grad_norm": 1.260488510131836, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 4710 + }, + { + "epoch": 6.145833333333333, + "grad_norm": 1.231615662574768, + "learning_rate": 0.0002, + "loss": 1.0225, + "step": 4720 + }, + { + "epoch": 6.158854166666667, + "grad_norm": 1.049696922302246, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 4730 + }, + { + "epoch": 6.171875, + "grad_norm": 1.145426869392395, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 4740 + }, + { + "epoch": 6.184895833333333, + "grad_norm": 1.1715868711471558, + "learning_rate": 0.0002, + "loss": 1.0751, + "step": 4750 + }, + { + "epoch": 6.197916666666667, + "grad_norm": 1.2575212717056274, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 4760 + }, + { + "epoch": 6.2109375, + "grad_norm": 1.2996530532836914, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 4770 + }, + { + "epoch": 6.223958333333333, + "grad_norm": 1.4030718803405762, + "learning_rate": 0.0002, + "loss": 1.0227, + "step": 4780 + }, + { + "epoch": 6.236979166666667, + "grad_norm": 1.2140913009643555, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 4790 + }, + { + "epoch": 6.25, + "grad_norm": 1.3512893915176392, + "learning_rate": 0.0002, + "loss": 1.0637, + "step": 4800 + }, + { + "epoch": 6.263020833333333, + "grad_norm": 1.1931439638137817, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 4810 + }, + { + "epoch": 6.276041666666667, + "grad_norm": 1.0379345417022705, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 4820 + }, + { + "epoch": 6.2890625, + "grad_norm": 1.1571568250656128, + "learning_rate": 0.0002, + "loss": 1.0954, + "step": 4830 + }, + { + "epoch": 6.302083333333333, + "grad_norm": 1.0717264413833618, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 4840 + }, + { + "epoch": 6.315104166666667, + "grad_norm": 1.360496997833252, + "learning_rate": 0.0002, + "loss": 1.0466, + "step": 4850 + }, + { + "epoch": 6.328125, + "grad_norm": 1.0864052772521973, + "learning_rate": 0.0002, + "loss": 1.001, + "step": 4860 + }, + { + "epoch": 6.341145833333333, + "grad_norm": 1.3391871452331543, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 4870 + }, + { + "epoch": 6.354166666666667, + "grad_norm": 1.2568541765213013, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 4880 + }, + { + "epoch": 6.3671875, + "grad_norm": 1.255483627319336, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 4890 + }, + { + "epoch": 6.380208333333333, + "grad_norm": 1.173972487449646, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 4900 + }, + { + "epoch": 6.393229166666667, + "grad_norm": 1.14010488986969, + "learning_rate": 0.0002, + "loss": 1.0238, + "step": 4910 + }, + { + "epoch": 6.40625, + "grad_norm": 1.1317493915557861, + "learning_rate": 0.0002, + "loss": 1.0319, + "step": 4920 + }, + { + "epoch": 6.419270833333333, + "grad_norm": 1.1547486782073975, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 4930 + }, + { + "epoch": 6.432291666666667, + "grad_norm": 1.1822998523712158, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 4940 + }, + { + "epoch": 6.4453125, + "grad_norm": 1.1865756511688232, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 4950 + }, + { + "epoch": 6.458333333333333, + "grad_norm": 1.13661789894104, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 4960 + }, + { + "epoch": 6.471354166666667, + "grad_norm": 1.047326683998108, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 4970 + }, + { + "epoch": 6.484375, + "grad_norm": 1.3550827503204346, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 4980 + }, + { + "epoch": 6.497395833333333, + "grad_norm": 1.2868435382843018, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 4990 + }, + { + "epoch": 6.510416666666667, + "grad_norm": 1.4678666591644287, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 5000 + }, + { + "epoch": 6.5234375, + "grad_norm": 1.3739159107208252, + "learning_rate": 0.0002, + "loss": 1.076, + "step": 5010 + }, + { + "epoch": 6.536458333333333, + "grad_norm": 1.213034987449646, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 5020 + }, + { + "epoch": 6.549479166666667, + "grad_norm": 1.5025049448013306, + "learning_rate": 0.0002, + "loss": 1.1129, + "step": 5030 + }, + { + "epoch": 6.5625, + "grad_norm": 1.1811821460723877, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 5040 + }, + { + "epoch": 6.575520833333333, + "grad_norm": 1.2845960855484009, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 5050 + }, + { + "epoch": 6.588541666666667, + "grad_norm": 1.0641103982925415, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 5060 + }, + { + "epoch": 6.6015625, + "grad_norm": 1.0967134237289429, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 5070 + }, + { + "epoch": 6.614583333333333, + "grad_norm": 1.1802116632461548, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5080 + }, + { + "epoch": 6.627604166666667, + "grad_norm": 1.3110308647155762, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 5090 + }, + { + "epoch": 6.640625, + "grad_norm": 1.1863301992416382, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 5100 + }, + { + "epoch": 6.653645833333333, + "grad_norm": 1.0931109189987183, + "learning_rate": 0.0002, + "loss": 1.1355, + "step": 5110 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0571614503860474, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 5120 + }, + { + "epoch": 6.6796875, + "grad_norm": 1.2855656147003174, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 5130 + }, + { + "epoch": 6.692708333333333, + "grad_norm": 1.2217806577682495, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 5140 + }, + { + "epoch": 6.705729166666667, + "grad_norm": 1.093658447265625, + "learning_rate": 0.0002, + "loss": 1.1098, + "step": 5150 + }, + { + "epoch": 6.71875, + "grad_norm": 1.2592076063156128, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 5160 + }, + { + "epoch": 6.731770833333333, + "grad_norm": 1.0720105171203613, + "learning_rate": 0.0002, + "loss": 1.0381, + "step": 5170 + }, + { + "epoch": 6.744791666666667, + "grad_norm": 1.178058385848999, + "learning_rate": 0.0002, + "loss": 1.0707, + "step": 5180 + }, + { + "epoch": 6.7578125, + "grad_norm": 1.1897447109222412, + "learning_rate": 0.0002, + "loss": 1.116, + "step": 5190 + }, + { + "epoch": 6.770833333333333, + "grad_norm": 1.3547686338424683, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5200 + }, + { + "epoch": 6.783854166666667, + "grad_norm": 1.2514727115631104, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 5210 + }, + { + "epoch": 6.796875, + "grad_norm": 1.5253846645355225, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 5220 + }, + { + "epoch": 6.809895833333333, + "grad_norm": 1.090774655342102, + "learning_rate": 0.0002, + "loss": 1.0426, + "step": 5230 + }, + { + "epoch": 6.822916666666667, + "grad_norm": 1.1387991905212402, + "learning_rate": 0.0002, + "loss": 1.0867, + "step": 5240 + }, + { + "epoch": 6.8359375, + "grad_norm": 1.102423906326294, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 5250 + }, + { + "epoch": 6.848958333333333, + "grad_norm": 1.2453415393829346, + "learning_rate": 0.0002, + "loss": 1.0976, + "step": 5260 + }, + { + "epoch": 6.861979166666667, + "grad_norm": 1.2541141510009766, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 5270 + }, + { + "epoch": 6.875, + "grad_norm": 1.2719744443893433, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 5280 + }, + { + "epoch": 6.888020833333333, + "grad_norm": 1.085763931274414, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 5290 + }, + { + "epoch": 6.901041666666667, + "grad_norm": 1.2399879693984985, + "learning_rate": 0.0002, + "loss": 1.1306, + "step": 5300 + }, + { + "epoch": 6.9140625, + "grad_norm": 1.244888424873352, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 5310 + }, + { + "epoch": 6.927083333333333, + "grad_norm": 1.1424126625061035, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 5320 + }, + { + "epoch": 6.940104166666667, + "grad_norm": 1.1804956197738647, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 5330 + }, + { + "epoch": 6.953125, + "grad_norm": 1.3943406343460083, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 5340 + }, + { + "epoch": 6.966145833333333, + "grad_norm": 1.3278584480285645, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 5350 + }, + { + "epoch": 6.979166666666667, + "grad_norm": 1.3579362630844116, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 5360 + }, + { + "epoch": 6.9921875, + "grad_norm": 1.2172175645828247, + "learning_rate": 0.0002, + "loss": 1.059, + "step": 5370 + }, + { + "epoch": 7.0, + "eval_loss": 2.200756549835205, + "eval_runtime": 42.8258, + "eval_samples_per_second": 12.025, + "eval_steps_per_second": 1.518, + "step": 5376 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.487892426484613e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-5376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16e449b983f5dad6a837d1852cae57a9f46468f0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b435874d6d972600ec8b71f7f8fc0ffbc1dd5c246986cda1c43d4eea85cf68b +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0e77455167e1e96996aeea167218ef86ca864ec7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147799e96106e7673ec72ec8e58d566c57c4f7ace8ea38ae35b60eeb21ed0574 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cabcb7bd7d5fe0f13fe4995865bdb063c01c95e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cf580af0fef7815a824a79b470657f7162784c8ffd714f8d708b229e0239c2 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5cb69ddf29ea50af62b287a84dd0eee2c022cb78 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e26a0258952b2cf4d55cfaf38b3b58babce56dd186c0cfac1c7cf33406881a03 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af6eed7c4755e3ee4d539270abd4276f83329bdd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/trainer_state.json @@ -0,0 +1,4395 @@ +{ + "best_metric": 1.820037841796875, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 6144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 0.303970068693161, + "learning_rate": 0.0002, + "loss": 1.8072, + "step": 770 + }, + { + "epoch": 1.015625, + "grad_norm": 0.32745876908302307, + "learning_rate": 0.0002, + "loss": 1.6708, + "step": 780 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 0.33467888832092285, + "learning_rate": 0.0002, + "loss": 1.623, + "step": 790 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.38253068923950195, + "learning_rate": 0.0002, + "loss": 1.746, + "step": 800 + }, + { + "epoch": 1.0546875, + "grad_norm": 0.3955802023410797, + "learning_rate": 0.0002, + "loss": 1.685, + "step": 810 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 0.3534117043018341, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 820 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 0.33427858352661133, + "learning_rate": 0.0002, + "loss": 1.6361, + "step": 830 + }, + { + "epoch": 1.09375, + "grad_norm": 0.35261571407318115, + "learning_rate": 0.0002, + "loss": 1.7435, + "step": 840 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 0.4416263997554779, + "learning_rate": 0.0002, + "loss": 1.7112, + "step": 850 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 0.3918050229549408, + "learning_rate": 0.0002, + "loss": 1.6311, + "step": 860 + }, + { + "epoch": 1.1328125, + "grad_norm": 0.38482677936553955, + "learning_rate": 0.0002, + "loss": 1.6804, + "step": 870 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.4945143759250641, + "learning_rate": 0.0002, + "loss": 1.6951, + "step": 880 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 0.429677814245224, + "learning_rate": 0.0002, + "loss": 1.7577, + "step": 890 + }, + { + "epoch": 1.171875, + "grad_norm": 0.41878288984298706, + "learning_rate": 0.0002, + "loss": 1.7204, + "step": 900 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 0.41578373312950134, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 910 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 0.37028902769088745, + "learning_rate": 0.0002, + "loss": 1.7017, + "step": 920 + }, + { + "epoch": 1.2109375, + "grad_norm": 0.3824995756149292, + "learning_rate": 0.0002, + "loss": 1.7074, + "step": 930 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 0.3818865418434143, + "learning_rate": 0.0002, + "loss": 1.6185, + "step": 940 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 0.3930460810661316, + "learning_rate": 0.0002, + "loss": 1.7894, + "step": 950 + }, + { + "epoch": 1.25, + "grad_norm": 0.3904426395893097, + "learning_rate": 0.0002, + "loss": 1.6766, + "step": 960 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 0.4175802171230316, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 970 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 0.42343786358833313, + "learning_rate": 0.0002, + "loss": 1.7556, + "step": 980 + }, + { + "epoch": 1.2890625, + "grad_norm": 0.4168420135974884, + "learning_rate": 0.0002, + "loss": 1.6339, + "step": 990 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 0.38692983984947205, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1000 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 0.5037692189216614, + "learning_rate": 0.0002, + "loss": 1.6384, + "step": 1010 + }, + { + "epoch": 1.328125, + "grad_norm": 0.39436691999435425, + "learning_rate": 0.0002, + "loss": 1.6878, + "step": 1020 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 0.3431943356990814, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 1030 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.39167070388793945, + "learning_rate": 0.0002, + "loss": 1.7034, + "step": 1040 + }, + { + "epoch": 1.3671875, + "grad_norm": 0.3820446729660034, + "learning_rate": 0.0002, + "loss": 1.7108, + "step": 1050 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 0.4190749526023865, + "learning_rate": 0.0002, + "loss": 1.7885, + "step": 1060 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 0.3618869185447693, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1070 + }, + { + "epoch": 1.40625, + "grad_norm": 0.38852423429489136, + "learning_rate": 0.0002, + "loss": 1.6199, + "step": 1080 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 0.49829256534576416, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 1090 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 0.3956700563430786, + "learning_rate": 0.0002, + "loss": 1.6589, + "step": 1100 + }, + { + "epoch": 1.4453125, + "grad_norm": 0.38829147815704346, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 1110 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.37237483263015747, + "learning_rate": 0.0002, + "loss": 1.6709, + "step": 1120 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 0.39798808097839355, + "learning_rate": 0.0002, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 1.484375, + "grad_norm": 0.38188642263412476, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 1140 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 0.44961944222450256, + "learning_rate": 0.0002, + "loss": 1.6707, + "step": 1150 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 0.3816550374031067, + "learning_rate": 0.0002, + "loss": 1.6241, + "step": 1160 + }, + { + "epoch": 1.5234375, + "grad_norm": 0.3885478973388672, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 1170 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 0.42779695987701416, + "learning_rate": 0.0002, + "loss": 1.7285, + "step": 1180 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 0.41499748826026917, + "learning_rate": 0.0002, + "loss": 1.7399, + "step": 1190 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4319412410259247, + "learning_rate": 0.0002, + "loss": 1.6569, + "step": 1200 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 0.38847389817237854, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 1210 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 0.45832890272140503, + "learning_rate": 0.0002, + "loss": 1.6666, + "step": 1220 + }, + { + "epoch": 1.6015625, + "grad_norm": 0.45928797125816345, + "learning_rate": 0.0002, + "loss": 1.68, + "step": 1230 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 0.4052276611328125, + "learning_rate": 0.0002, + "loss": 1.7225, + "step": 1240 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 0.4031650424003601, + "learning_rate": 0.0002, + "loss": 1.6722, + "step": 1250 + }, + { + "epoch": 1.640625, + "grad_norm": 0.36724114418029785, + "learning_rate": 0.0002, + "loss": 1.7243, + "step": 1260 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 0.4188505709171295, + "learning_rate": 0.0002, + "loss": 1.7672, + "step": 1270 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3982168138027191, + "learning_rate": 0.0002, + "loss": 1.7685, + "step": 1280 + }, + { + "epoch": 1.6796875, + "grad_norm": 0.3768596053123474, + "learning_rate": 0.0002, + "loss": 1.6831, + "step": 1290 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 0.3843287527561188, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1300 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 0.3982345461845398, + "learning_rate": 0.0002, + "loss": 1.6188, + "step": 1310 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3407546281814575, + "learning_rate": 0.0002, + "loss": 1.7084, + "step": 1320 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 0.36327359080314636, + "learning_rate": 0.0002, + "loss": 1.7316, + "step": 1330 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 0.4141675531864166, + "learning_rate": 0.0002, + "loss": 1.734, + "step": 1340 + }, + { + "epoch": 1.7578125, + "grad_norm": 0.43894267082214355, + "learning_rate": 0.0002, + "loss": 1.7257, + "step": 1350 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.40564292669296265, + "learning_rate": 0.0002, + "loss": 1.6613, + "step": 1360 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 0.3978462815284729, + "learning_rate": 0.0002, + "loss": 1.6841, + "step": 1370 + }, + { + "epoch": 1.796875, + "grad_norm": 0.37140771746635437, + "learning_rate": 0.0002, + "loss": 1.6497, + "step": 1380 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 0.43164145946502686, + "learning_rate": 0.0002, + "loss": 1.742, + "step": 1390 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 0.38034674525260925, + "learning_rate": 0.0002, + "loss": 1.7253, + "step": 1400 + }, + { + "epoch": 1.8359375, + "grad_norm": 0.4235687851905823, + "learning_rate": 0.0002, + "loss": 1.652, + "step": 1410 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 0.37417489290237427, + "learning_rate": 0.0002, + "loss": 1.752, + "step": 1420 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 0.4303789734840393, + "learning_rate": 0.0002, + "loss": 1.6995, + "step": 1430 + }, + { + "epoch": 1.875, + "grad_norm": 0.43942129611968994, + "learning_rate": 0.0002, + "loss": 1.6489, + "step": 1440 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 0.3866581320762634, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 1450 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 0.3686903417110443, + "learning_rate": 0.0002, + "loss": 1.72, + "step": 1460 + }, + { + "epoch": 1.9140625, + "grad_norm": 0.3885461986064911, + "learning_rate": 0.0002, + "loss": 1.6545, + "step": 1470 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 0.4156927466392517, + "learning_rate": 0.0002, + "loss": 1.6981, + "step": 1480 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 0.3934236168861389, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 1490 + }, + { + "epoch": 1.953125, + "grad_norm": 0.38645586371421814, + "learning_rate": 0.0002, + "loss": 1.7384, + "step": 1500 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 0.43272635340690613, + "learning_rate": 0.0002, + "loss": 1.7033, + "step": 1510 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.42476025223731995, + "learning_rate": 0.0002, + "loss": 1.6138, + "step": 1520 + }, + { + "epoch": 1.9921875, + "grad_norm": 0.37216147780418396, + "learning_rate": 0.0002, + "loss": 1.5834, + "step": 1530 + }, + { + "epoch": 2.0, + "eval_loss": 1.820037841796875, + "eval_runtime": 101.0456, + "eval_samples_per_second": 5.097, + "eval_steps_per_second": 0.643, + "step": 1536 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 0.39003029465675354, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 1540 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 0.4302637577056885, + "learning_rate": 0.0002, + "loss": 1.5447, + "step": 1550 + }, + { + "epoch": 2.03125, + "grad_norm": 0.4496043026447296, + "learning_rate": 0.0002, + "loss": 1.5951, + "step": 1560 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 0.42824679613113403, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 1570 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 0.44775739312171936, + "learning_rate": 0.0002, + "loss": 1.5996, + "step": 1580 + }, + { + "epoch": 2.0703125, + "grad_norm": 0.4705299735069275, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1590 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.4614814817905426, + "learning_rate": 0.0002, + "loss": 1.7589, + "step": 1600 + }, + { + "epoch": 2.0963541666666665, + "grad_norm": 0.45097213983535767, + "learning_rate": 0.0002, + "loss": 1.5762, + "step": 1610 + }, + { + "epoch": 2.109375, + "grad_norm": 0.41954323649406433, + "learning_rate": 0.0002, + "loss": 1.4947, + "step": 1620 + }, + { + "epoch": 2.1223958333333335, + "grad_norm": 0.44894352555274963, + "learning_rate": 0.0002, + "loss": 1.6397, + "step": 1630 + }, + { + "epoch": 2.1354166666666665, + "grad_norm": 0.4421502947807312, + "learning_rate": 0.0002, + "loss": 1.5251, + "step": 1640 + }, + { + "epoch": 2.1484375, + "grad_norm": 0.44649967551231384, + "learning_rate": 0.0002, + "loss": 1.5931, + "step": 1650 + }, + { + "epoch": 2.1614583333333335, + "grad_norm": 0.44216716289520264, + "learning_rate": 0.0002, + "loss": 1.6327, + "step": 1660 + }, + { + "epoch": 2.1744791666666665, + "grad_norm": 0.6363232135772705, + "learning_rate": 0.0002, + "loss": 1.5924, + "step": 1670 + }, + { + "epoch": 2.1875, + "grad_norm": 0.46533334255218506, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 1680 + }, + { + "epoch": 2.2005208333333335, + "grad_norm": 0.48486822843551636, + "learning_rate": 0.0002, + "loss": 1.5539, + "step": 1690 + }, + { + "epoch": 2.2135416666666665, + "grad_norm": 0.43277066946029663, + "learning_rate": 0.0002, + "loss": 1.6322, + "step": 1700 + }, + { + "epoch": 2.2265625, + "grad_norm": 0.45927226543426514, + "learning_rate": 0.0002, + "loss": 1.4979, + "step": 1710 + }, + { + "epoch": 2.2395833333333335, + "grad_norm": 0.4654010236263275, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 1720 + }, + { + "epoch": 2.2526041666666665, + "grad_norm": 0.49796584248542786, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 1730 + }, + { + "epoch": 2.265625, + "grad_norm": 0.4506736397743225, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 1740 + }, + { + "epoch": 2.2786458333333335, + "grad_norm": 0.46757954359054565, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 1750 + }, + { + "epoch": 2.2916666666666665, + "grad_norm": 0.4507335424423218, + "learning_rate": 0.0002, + "loss": 1.6307, + "step": 1760 + }, + { + "epoch": 2.3046875, + "grad_norm": 0.43900197744369507, + "learning_rate": 0.0002, + "loss": 1.5905, + "step": 1770 + }, + { + "epoch": 2.3177083333333335, + "grad_norm": 0.48013004660606384, + "learning_rate": 0.0002, + "loss": 1.6655, + "step": 1780 + }, + { + "epoch": 2.3307291666666665, + "grad_norm": 0.41891220211982727, + "learning_rate": 0.0002, + "loss": 1.6024, + "step": 1790 + }, + { + "epoch": 2.34375, + "grad_norm": 0.4879191219806671, + "learning_rate": 0.0002, + "loss": 1.658, + "step": 1800 + }, + { + "epoch": 2.3567708333333335, + "grad_norm": 0.46148231625556946, + "learning_rate": 0.0002, + "loss": 1.6084, + "step": 1810 + }, + { + "epoch": 2.3697916666666665, + "grad_norm": 0.5114223957061768, + "learning_rate": 0.0002, + "loss": 1.6072, + "step": 1820 + }, + { + "epoch": 2.3828125, + "grad_norm": 0.4828612804412842, + "learning_rate": 0.0002, + "loss": 1.5505, + "step": 1830 + }, + { + "epoch": 2.3958333333333335, + "grad_norm": 0.4672335386276245, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 1840 + }, + { + "epoch": 2.4088541666666665, + "grad_norm": 0.4914792776107788, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 1850 + }, + { + "epoch": 2.421875, + "grad_norm": 0.44478079676628113, + "learning_rate": 0.0002, + "loss": 1.5356, + "step": 1860 + }, + { + "epoch": 2.4348958333333335, + "grad_norm": 0.4601325988769531, + "learning_rate": 0.0002, + "loss": 1.7262, + "step": 1870 + }, + { + "epoch": 2.4479166666666665, + "grad_norm": 0.44539815187454224, + "learning_rate": 0.0002, + "loss": 1.555, + "step": 1880 + }, + { + "epoch": 2.4609375, + "grad_norm": 0.4532422125339508, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 1890 + }, + { + "epoch": 2.4739583333333335, + "grad_norm": 0.5323562622070312, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 1900 + }, + { + "epoch": 2.4869791666666665, + "grad_norm": 0.5027516484260559, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1910 + }, + { + "epoch": 2.5, + "grad_norm": 0.4507808983325958, + "learning_rate": 0.0002, + "loss": 1.5471, + "step": 1920 + }, + { + "epoch": 2.5130208333333335, + "grad_norm": 0.4996422827243805, + "learning_rate": 0.0002, + "loss": 1.613, + "step": 1930 + }, + { + "epoch": 2.5260416666666665, + "grad_norm": 0.4964800179004669, + "learning_rate": 0.0002, + "loss": 1.6412, + "step": 1940 + }, + { + "epoch": 2.5390625, + "grad_norm": 0.48546481132507324, + "learning_rate": 0.0002, + "loss": 1.547, + "step": 1950 + }, + { + "epoch": 2.5520833333333335, + "grad_norm": 0.47357916831970215, + "learning_rate": 0.0002, + "loss": 1.6075, + "step": 1960 + }, + { + "epoch": 2.5651041666666665, + "grad_norm": 0.47136595845222473, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 1970 + }, + { + "epoch": 2.578125, + "grad_norm": 0.5185502171516418, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 1980 + }, + { + "epoch": 2.5911458333333335, + "grad_norm": 0.47995880246162415, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 1990 + }, + { + "epoch": 2.6041666666666665, + "grad_norm": 0.5076674222946167, + "learning_rate": 0.0002, + "loss": 1.638, + "step": 2000 + }, + { + "epoch": 2.6171875, + "grad_norm": 0.4805421233177185, + "learning_rate": 0.0002, + "loss": 1.6038, + "step": 2010 + }, + { + "epoch": 2.6302083333333335, + "grad_norm": 0.4406864047050476, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 2020 + }, + { + "epoch": 2.6432291666666665, + "grad_norm": 0.521388828754425, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2030 + }, + { + "epoch": 2.65625, + "grad_norm": 0.4531918466091156, + "learning_rate": 0.0002, + "loss": 1.5338, + "step": 2040 + }, + { + "epoch": 2.6692708333333335, + "grad_norm": 0.45295774936676025, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 2050 + }, + { + "epoch": 2.6822916666666665, + "grad_norm": 0.4573723375797272, + "learning_rate": 0.0002, + "loss": 1.5252, + "step": 2060 + }, + { + "epoch": 2.6953125, + "grad_norm": 0.4836064279079437, + "learning_rate": 0.0002, + "loss": 1.5765, + "step": 2070 + }, + { + "epoch": 2.7083333333333335, + "grad_norm": 0.5040885210037231, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 2080 + }, + { + "epoch": 2.7213541666666665, + "grad_norm": 0.5153458118438721, + "learning_rate": 0.0002, + "loss": 1.6438, + "step": 2090 + }, + { + "epoch": 2.734375, + "grad_norm": 0.4415692090988159, + "learning_rate": 0.0002, + "loss": 1.5917, + "step": 2100 + }, + { + "epoch": 2.7473958333333335, + "grad_norm": 0.4862712621688843, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 2110 + }, + { + "epoch": 2.7604166666666665, + "grad_norm": 0.4845922589302063, + "learning_rate": 0.0002, + "loss": 1.5797, + "step": 2120 + }, + { + "epoch": 2.7734375, + "grad_norm": 0.5153566598892212, + "learning_rate": 0.0002, + "loss": 1.6404, + "step": 2130 + }, + { + "epoch": 2.7864583333333335, + "grad_norm": 0.4220491945743561, + "learning_rate": 0.0002, + "loss": 1.5609, + "step": 2140 + }, + { + "epoch": 2.7994791666666665, + "grad_norm": 0.523292064666748, + "learning_rate": 0.0002, + "loss": 1.5404, + "step": 2150 + }, + { + "epoch": 2.8125, + "grad_norm": 0.4567972421646118, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2160 + }, + { + "epoch": 2.8255208333333335, + "grad_norm": 0.6252557039260864, + "learning_rate": 0.0002, + "loss": 1.6279, + "step": 2170 + }, + { + "epoch": 2.8385416666666665, + "grad_norm": 0.5231373310089111, + "learning_rate": 0.0002, + "loss": 1.6203, + "step": 2180 + }, + { + "epoch": 2.8515625, + "grad_norm": 0.49243974685668945, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 2190 + }, + { + "epoch": 2.8645833333333335, + "grad_norm": 0.521644115447998, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 2200 + }, + { + "epoch": 2.8776041666666665, + "grad_norm": 0.4624195694923401, + "learning_rate": 0.0002, + "loss": 1.6812, + "step": 2210 + }, + { + "epoch": 2.890625, + "grad_norm": 0.4463620185852051, + "learning_rate": 0.0002, + "loss": 1.6132, + "step": 2220 + }, + { + "epoch": 2.9036458333333335, + "grad_norm": 0.45793524384498596, + "learning_rate": 0.0002, + "loss": 1.6095, + "step": 2230 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.46979188919067383, + "learning_rate": 0.0002, + "loss": 1.5985, + "step": 2240 + }, + { + "epoch": 2.9296875, + "grad_norm": 0.5220303535461426, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 2250 + }, + { + "epoch": 2.9427083333333335, + "grad_norm": 0.44405895471572876, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2260 + }, + { + "epoch": 2.9557291666666665, + "grad_norm": 0.523841381072998, + "learning_rate": 0.0002, + "loss": 1.6685, + "step": 2270 + }, + { + "epoch": 2.96875, + "grad_norm": 0.4928138852119446, + "learning_rate": 0.0002, + "loss": 1.595, + "step": 2280 + }, + { + "epoch": 2.9817708333333335, + "grad_norm": 0.4918071925640106, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 2290 + }, + { + "epoch": 2.9947916666666665, + "grad_norm": 0.4584912061691284, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 2300 + }, + { + "epoch": 3.0, + "eval_loss": 1.8474308252334595, + "eval_runtime": 103.7697, + "eval_samples_per_second": 4.963, + "eval_steps_per_second": 0.626, + "step": 2304 + }, + { + "epoch": 3.0078125, + "grad_norm": 0.4801871180534363, + "learning_rate": 0.0002, + "loss": 1.5454, + "step": 2310 + }, + { + "epoch": 3.0208333333333335, + "grad_norm": 0.5789998173713684, + "learning_rate": 0.0002, + "loss": 1.4019, + "step": 2320 + }, + { + "epoch": 3.0338541666666665, + "grad_norm": 0.49856704473495483, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2330 + }, + { + "epoch": 3.046875, + "grad_norm": 0.5625631213188171, + "learning_rate": 0.0002, + "loss": 1.4718, + "step": 2340 + }, + { + "epoch": 3.0598958333333335, + "grad_norm": 0.557637095451355, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 2350 + }, + { + "epoch": 3.0729166666666665, + "grad_norm": 0.528889536857605, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 2360 + }, + { + "epoch": 3.0859375, + "grad_norm": 0.5952284932136536, + "learning_rate": 0.0002, + "loss": 1.4307, + "step": 2370 + }, + { + "epoch": 3.0989583333333335, + "grad_norm": 0.5549899339675903, + "learning_rate": 0.0002, + "loss": 1.5304, + "step": 2380 + }, + { + "epoch": 3.1119791666666665, + "grad_norm": 0.662139892578125, + "learning_rate": 0.0002, + "loss": 1.5034, + "step": 2390 + }, + { + "epoch": 3.125, + "grad_norm": 0.5281530618667603, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 3.1380208333333335, + "grad_norm": 0.6134106516838074, + "learning_rate": 0.0002, + "loss": 1.4047, + "step": 2410 + }, + { + "epoch": 3.1510416666666665, + "grad_norm": 0.6040887236595154, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2420 + }, + { + "epoch": 3.1640625, + "grad_norm": 0.549672544002533, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 2430 + }, + { + "epoch": 3.1770833333333335, + "grad_norm": 0.9195653796195984, + "learning_rate": 0.0002, + "loss": 1.401, + "step": 2440 + }, + { + "epoch": 3.1901041666666665, + "grad_norm": 0.5578703284263611, + "learning_rate": 0.0002, + "loss": 1.507, + "step": 2450 + }, + { + "epoch": 3.203125, + "grad_norm": 0.5982925891876221, + "learning_rate": 0.0002, + "loss": 1.4873, + "step": 2460 + }, + { + "epoch": 3.2161458333333335, + "grad_norm": 0.5544393062591553, + "learning_rate": 0.0002, + "loss": 1.4909, + "step": 2470 + }, + { + "epoch": 3.2291666666666665, + "grad_norm": 0.6015266180038452, + "learning_rate": 0.0002, + "loss": 1.4705, + "step": 2480 + }, + { + "epoch": 3.2421875, + "grad_norm": 0.5995243191719055, + "learning_rate": 0.0002, + "loss": 1.4652, + "step": 2490 + }, + { + "epoch": 3.2552083333333335, + "grad_norm": 0.5846129059791565, + "learning_rate": 0.0002, + "loss": 1.4486, + "step": 2500 + }, + { + "epoch": 3.2682291666666665, + "grad_norm": 0.5552570223808289, + "learning_rate": 0.0002, + "loss": 1.4529, + "step": 2510 + }, + { + "epoch": 3.28125, + "grad_norm": 0.576998233795166, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 2520 + }, + { + "epoch": 3.2942708333333335, + "grad_norm": 0.6526138186454773, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2530 + }, + { + "epoch": 3.3072916666666665, + "grad_norm": 0.6064265966415405, + "learning_rate": 0.0002, + "loss": 1.474, + "step": 2540 + }, + { + "epoch": 3.3203125, + "grad_norm": 0.5542362928390503, + "learning_rate": 0.0002, + "loss": 1.5125, + "step": 2550 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6048482060432434, + "learning_rate": 0.0002, + "loss": 1.4769, + "step": 2560 + }, + { + "epoch": 3.3463541666666665, + "grad_norm": 0.6328344941139221, + "learning_rate": 0.0002, + "loss": 1.4682, + "step": 2570 + }, + { + "epoch": 3.359375, + "grad_norm": 0.6347311735153198, + "learning_rate": 0.0002, + "loss": 1.5647, + "step": 2580 + }, + { + "epoch": 3.3723958333333335, + "grad_norm": 0.537570595741272, + "learning_rate": 0.0002, + "loss": 1.5752, + "step": 2590 + }, + { + "epoch": 3.3854166666666665, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.0002, + "loss": 1.4086, + "step": 2600 + }, + { + "epoch": 3.3984375, + "grad_norm": 0.5914373993873596, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 2610 + }, + { + "epoch": 3.4114583333333335, + "grad_norm": 0.6724640130996704, + "learning_rate": 0.0002, + "loss": 1.4436, + "step": 2620 + }, + { + "epoch": 3.4244791666666665, + "grad_norm": 0.6295472383499146, + "learning_rate": 0.0002, + "loss": 1.5731, + "step": 2630 + }, + { + "epoch": 3.4375, + "grad_norm": 0.5842770934104919, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 2640 + }, + { + "epoch": 3.4505208333333335, + "grad_norm": 0.6297776699066162, + "learning_rate": 0.0002, + "loss": 1.451, + "step": 2650 + }, + { + "epoch": 3.4635416666666665, + "grad_norm": 0.6105847358703613, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 2660 + }, + { + "epoch": 3.4765625, + "grad_norm": 0.6294940710067749, + "learning_rate": 0.0002, + "loss": 1.5332, + "step": 2670 + }, + { + "epoch": 3.4895833333333335, + "grad_norm": 0.6573333740234375, + "learning_rate": 0.0002, + "loss": 1.5451, + "step": 2680 + }, + { + "epoch": 3.5026041666666665, + "grad_norm": 0.663661539554596, + "learning_rate": 0.0002, + "loss": 1.4592, + "step": 2690 + }, + { + "epoch": 3.515625, + "grad_norm": 0.6729148626327515, + "learning_rate": 0.0002, + "loss": 1.5286, + "step": 2700 + }, + { + "epoch": 3.5286458333333335, + "grad_norm": 0.6633102893829346, + "learning_rate": 0.0002, + "loss": 1.534, + "step": 2710 + }, + { + "epoch": 3.5416666666666665, + "grad_norm": 0.567686915397644, + "learning_rate": 0.0002, + "loss": 1.4023, + "step": 2720 + }, + { + "epoch": 3.5546875, + "grad_norm": 0.6281962394714355, + "learning_rate": 0.0002, + "loss": 1.4925, + "step": 2730 + }, + { + "epoch": 3.5677083333333335, + "grad_norm": 0.5710738897323608, + "learning_rate": 0.0002, + "loss": 1.5028, + "step": 2740 + }, + { + "epoch": 3.5807291666666665, + "grad_norm": 0.648162305355072, + "learning_rate": 0.0002, + "loss": 1.4393, + "step": 2750 + }, + { + "epoch": 3.59375, + "grad_norm": 0.5466254949569702, + "learning_rate": 0.0002, + "loss": 1.4294, + "step": 2760 + }, + { + "epoch": 3.6067708333333335, + "grad_norm": 0.6867973208427429, + "learning_rate": 0.0002, + "loss": 1.4993, + "step": 2770 + }, + { + "epoch": 3.6197916666666665, + "grad_norm": 0.673612117767334, + "learning_rate": 0.0002, + "loss": 1.4463, + "step": 2780 + }, + { + "epoch": 3.6328125, + "grad_norm": 0.6928417086601257, + "learning_rate": 0.0002, + "loss": 1.5231, + "step": 2790 + }, + { + "epoch": 3.6458333333333335, + "grad_norm": 0.6603742837905884, + "learning_rate": 0.0002, + "loss": 1.5212, + "step": 2800 + }, + { + "epoch": 3.6588541666666665, + "grad_norm": 0.5964401960372925, + "learning_rate": 0.0002, + "loss": 1.4889, + "step": 2810 + }, + { + "epoch": 3.671875, + "grad_norm": 0.6224474310874939, + "learning_rate": 0.0002, + "loss": 1.4585, + "step": 2820 + }, + { + "epoch": 3.6848958333333335, + "grad_norm": 0.6592439413070679, + "learning_rate": 0.0002, + "loss": 1.5119, + "step": 2830 + }, + { + "epoch": 3.6979166666666665, + "grad_norm": 0.6255369186401367, + "learning_rate": 0.0002, + "loss": 1.4729, + "step": 2840 + }, + { + "epoch": 3.7109375, + "grad_norm": 0.7136337757110596, + "learning_rate": 0.0002, + "loss": 1.4598, + "step": 2850 + }, + { + "epoch": 3.7239583333333335, + "grad_norm": 0.6229757070541382, + "learning_rate": 0.0002, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 3.7369791666666665, + "grad_norm": 0.696080207824707, + "learning_rate": 0.0002, + "loss": 1.4175, + "step": 2870 + }, + { + "epoch": 3.75, + "grad_norm": 0.571873664855957, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2880 + }, + { + "epoch": 3.7630208333333335, + "grad_norm": 0.5918916463851929, + "learning_rate": 0.0002, + "loss": 1.4093, + "step": 2890 + }, + { + "epoch": 3.7760416666666665, + "grad_norm": 0.616413950920105, + "learning_rate": 0.0002, + "loss": 1.399, + "step": 2900 + }, + { + "epoch": 3.7890625, + "grad_norm": 0.6267292499542236, + "learning_rate": 0.0002, + "loss": 1.4215, + "step": 2910 + }, + { + "epoch": 3.8020833333333335, + "grad_norm": 0.6630783677101135, + "learning_rate": 0.0002, + "loss": 1.5095, + "step": 2920 + }, + { + "epoch": 3.8151041666666665, + "grad_norm": 0.6004238724708557, + "learning_rate": 0.0002, + "loss": 1.5323, + "step": 2930 + }, + { + "epoch": 3.828125, + "grad_norm": 0.6740423440933228, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2940 + }, + { + "epoch": 3.8411458333333335, + "grad_norm": 0.6397785544395447, + "learning_rate": 0.0002, + "loss": 1.549, + "step": 2950 + }, + { + "epoch": 3.8541666666666665, + "grad_norm": 0.6063735485076904, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2960 + }, + { + "epoch": 3.8671875, + "grad_norm": 0.6462053060531616, + "learning_rate": 0.0002, + "loss": 1.5093, + "step": 2970 + }, + { + "epoch": 3.8802083333333335, + "grad_norm": 0.7143250107765198, + "learning_rate": 0.0002, + "loss": 1.5237, + "step": 2980 + }, + { + "epoch": 3.8932291666666665, + "grad_norm": 0.6747874617576599, + "learning_rate": 0.0002, + "loss": 1.4419, + "step": 2990 + }, + { + "epoch": 3.90625, + "grad_norm": 0.622930109500885, + "learning_rate": 0.0002, + "loss": 1.5389, + "step": 3000 + }, + { + "epoch": 3.9192708333333335, + "grad_norm": 0.620193600654602, + "learning_rate": 0.0002, + "loss": 1.4279, + "step": 3010 + }, + { + "epoch": 3.9322916666666665, + "grad_norm": 0.6321487426757812, + "learning_rate": 0.0002, + "loss": 1.495, + "step": 3020 + }, + { + "epoch": 3.9453125, + "grad_norm": 0.5705523490905762, + "learning_rate": 0.0002, + "loss": 1.4657, + "step": 3030 + }, + { + "epoch": 3.9583333333333335, + "grad_norm": 0.6185072660446167, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 3040 + }, + { + "epoch": 3.9713541666666665, + "grad_norm": 0.6005704998970032, + "learning_rate": 0.0002, + "loss": 1.4667, + "step": 3050 + }, + { + "epoch": 3.984375, + "grad_norm": 0.5933769941329956, + "learning_rate": 0.0002, + "loss": 1.4896, + "step": 3060 + }, + { + "epoch": 3.9973958333333335, + "grad_norm": 0.695209801197052, + "learning_rate": 0.0002, + "loss": 1.4973, + "step": 3070 + }, + { + "epoch": 4.0, + "eval_loss": 1.8955267667770386, + "eval_runtime": 103.5061, + "eval_samples_per_second": 4.976, + "eval_steps_per_second": 0.628, + "step": 3072 + }, + { + "epoch": 4.010416666666667, + "grad_norm": 0.6706188321113586, + "learning_rate": 0.0002, + "loss": 1.3502, + "step": 3080 + }, + { + "epoch": 4.0234375, + "grad_norm": 0.7263980507850647, + "learning_rate": 0.0002, + "loss": 1.2917, + "step": 3090 + }, + { + "epoch": 4.036458333333333, + "grad_norm": 0.7767240405082703, + "learning_rate": 0.0002, + "loss": 1.2845, + "step": 3100 + }, + { + "epoch": 4.049479166666667, + "grad_norm": 0.6888399124145508, + "learning_rate": 0.0002, + "loss": 1.4169, + "step": 3110 + }, + { + "epoch": 4.0625, + "grad_norm": 0.8860331773757935, + "learning_rate": 0.0002, + "loss": 1.2422, + "step": 3120 + }, + { + "epoch": 4.075520833333333, + "grad_norm": 0.7572373151779175, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3130 + }, + { + "epoch": 4.088541666666667, + "grad_norm": 0.8321536183357239, + "learning_rate": 0.0002, + "loss": 1.2747, + "step": 3140 + }, + { + "epoch": 4.1015625, + "grad_norm": 0.7042664885520935, + "learning_rate": 0.0002, + "loss": 1.2843, + "step": 3150 + }, + { + "epoch": 4.114583333333333, + "grad_norm": 0.8910216689109802, + "learning_rate": 0.0002, + "loss": 1.3326, + "step": 3160 + }, + { + "epoch": 4.127604166666667, + "grad_norm": 0.8333232402801514, + "learning_rate": 0.0002, + "loss": 1.2742, + "step": 3170 + }, + { + "epoch": 4.140625, + "grad_norm": 0.7120883464813232, + "learning_rate": 0.0002, + "loss": 1.2985, + "step": 3180 + }, + { + "epoch": 4.153645833333333, + "grad_norm": 0.6904631853103638, + "learning_rate": 0.0002, + "loss": 1.3611, + "step": 3190 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.6398878693580627, + "learning_rate": 0.0002, + "loss": 1.2881, + "step": 3200 + }, + { + "epoch": 4.1796875, + "grad_norm": 0.7573692798614502, + "learning_rate": 0.0002, + "loss": 1.3323, + "step": 3210 + }, + { + "epoch": 4.192708333333333, + "grad_norm": 0.7850743532180786, + "learning_rate": 0.0002, + "loss": 1.3509, + "step": 3220 + }, + { + "epoch": 4.205729166666667, + "grad_norm": 0.7863165736198425, + "learning_rate": 0.0002, + "loss": 1.3176, + "step": 3230 + }, + { + "epoch": 4.21875, + "grad_norm": 0.7855865359306335, + "learning_rate": 0.0002, + "loss": 1.3739, + "step": 3240 + }, + { + "epoch": 4.231770833333333, + "grad_norm": 0.6840922832489014, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 3250 + }, + { + "epoch": 4.244791666666667, + "grad_norm": 0.8499747514724731, + "learning_rate": 0.0002, + "loss": 1.32, + "step": 3260 + }, + { + "epoch": 4.2578125, + "grad_norm": 0.7982883453369141, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 3270 + }, + { + "epoch": 4.270833333333333, + "grad_norm": 0.7776934504508972, + "learning_rate": 0.0002, + "loss": 1.3922, + "step": 3280 + }, + { + "epoch": 4.283854166666667, + "grad_norm": 0.8887693881988525, + "learning_rate": 0.0002, + "loss": 1.309, + "step": 3290 + }, + { + "epoch": 4.296875, + "grad_norm": 1.0184714794158936, + "learning_rate": 0.0002, + "loss": 1.3213, + "step": 3300 + }, + { + "epoch": 4.309895833333333, + "grad_norm": 0.7539387345314026, + "learning_rate": 0.0002, + "loss": 1.3212, + "step": 3310 + }, + { + "epoch": 4.322916666666667, + "grad_norm": 0.8137491345405579, + "learning_rate": 0.0002, + "loss": 1.3403, + "step": 3320 + }, + { + "epoch": 4.3359375, + "grad_norm": 0.8136276006698608, + "learning_rate": 0.0002, + "loss": 1.3069, + "step": 3330 + }, + { + "epoch": 4.348958333333333, + "grad_norm": 0.7880964279174805, + "learning_rate": 0.0002, + "loss": 1.3512, + "step": 3340 + }, + { + "epoch": 4.361979166666667, + "grad_norm": 0.8654456734657288, + "learning_rate": 0.0002, + "loss": 1.3468, + "step": 3350 + }, + { + "epoch": 4.375, + "grad_norm": 0.8093366622924805, + "learning_rate": 0.0002, + "loss": 1.3036, + "step": 3360 + }, + { + "epoch": 4.388020833333333, + "grad_norm": 0.8738575577735901, + "learning_rate": 0.0002, + "loss": 1.3826, + "step": 3370 + }, + { + "epoch": 4.401041666666667, + "grad_norm": 0.8923026919364929, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3380 + }, + { + "epoch": 4.4140625, + "grad_norm": 0.8508910536766052, + "learning_rate": 0.0002, + "loss": 1.3628, + "step": 3390 + }, + { + "epoch": 4.427083333333333, + "grad_norm": 0.8262084722518921, + "learning_rate": 0.0002, + "loss": 1.3048, + "step": 3400 + }, + { + "epoch": 4.440104166666667, + "grad_norm": 0.7843561768531799, + "learning_rate": 0.0002, + "loss": 1.3145, + "step": 3410 + }, + { + "epoch": 4.453125, + "grad_norm": 0.9087795615196228, + "learning_rate": 0.0002, + "loss": 1.4526, + "step": 3420 + }, + { + "epoch": 4.466145833333333, + "grad_norm": 0.8278809189796448, + "learning_rate": 0.0002, + "loss": 1.3492, + "step": 3430 + }, + { + "epoch": 4.479166666666667, + "grad_norm": 0.8337010741233826, + "learning_rate": 0.0002, + "loss": 1.3797, + "step": 3440 + }, + { + "epoch": 4.4921875, + "grad_norm": 0.7790088057518005, + "learning_rate": 0.0002, + "loss": 1.3199, + "step": 3450 + }, + { + "epoch": 4.505208333333333, + "grad_norm": 0.826231837272644, + "learning_rate": 0.0002, + "loss": 1.3344, + "step": 3460 + }, + { + "epoch": 4.518229166666667, + "grad_norm": 0.761461079120636, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 3470 + }, + { + "epoch": 4.53125, + "grad_norm": 0.8892785906791687, + "learning_rate": 0.0002, + "loss": 1.2829, + "step": 3480 + }, + { + "epoch": 4.544270833333333, + "grad_norm": 0.6087225675582886, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 3490 + }, + { + "epoch": 4.557291666666667, + "grad_norm": 0.8259274363517761, + "learning_rate": 0.0002, + "loss": 1.3167, + "step": 3500 + }, + { + "epoch": 4.5703125, + "grad_norm": 0.821164071559906, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3510 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.7262887954711914, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 3520 + }, + { + "epoch": 4.596354166666667, + "grad_norm": 0.8564826250076294, + "learning_rate": 0.0002, + "loss": 1.3777, + "step": 3530 + }, + { + "epoch": 4.609375, + "grad_norm": 0.8072929978370667, + "learning_rate": 0.0002, + "loss": 1.3238, + "step": 3540 + }, + { + "epoch": 4.622395833333333, + "grad_norm": 0.8040832877159119, + "learning_rate": 0.0002, + "loss": 1.43, + "step": 3550 + }, + { + "epoch": 4.635416666666667, + "grad_norm": 0.7268754839897156, + "learning_rate": 0.0002, + "loss": 1.2863, + "step": 3560 + }, + { + "epoch": 4.6484375, + "grad_norm": 0.9985134601593018, + "learning_rate": 0.0002, + "loss": 1.3485, + "step": 3570 + }, + { + "epoch": 4.661458333333333, + "grad_norm": 0.9826098680496216, + "learning_rate": 0.0002, + "loss": 1.3221, + "step": 3580 + }, + { + "epoch": 4.674479166666667, + "grad_norm": 0.8794422149658203, + "learning_rate": 0.0002, + "loss": 1.2878, + "step": 3590 + }, + { + "epoch": 4.6875, + "grad_norm": 0.7207489609718323, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 3600 + }, + { + "epoch": 4.700520833333333, + "grad_norm": 0.7546059489250183, + "learning_rate": 0.0002, + "loss": 1.3192, + "step": 3610 + }, + { + "epoch": 4.713541666666667, + "grad_norm": 0.8318526148796082, + "learning_rate": 0.0002, + "loss": 1.3445, + "step": 3620 + }, + { + "epoch": 4.7265625, + "grad_norm": 0.7529309391975403, + "learning_rate": 0.0002, + "loss": 1.3847, + "step": 3630 + }, + { + "epoch": 4.739583333333333, + "grad_norm": 0.7762532234191895, + "learning_rate": 0.0002, + "loss": 1.4208, + "step": 3640 + }, + { + "epoch": 4.752604166666667, + "grad_norm": 0.9306083917617798, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 3650 + }, + { + "epoch": 4.765625, + "grad_norm": 0.8050256967544556, + "learning_rate": 0.0002, + "loss": 1.3828, + "step": 3660 + }, + { + "epoch": 4.778645833333333, + "grad_norm": 0.8114449381828308, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3670 + }, + { + "epoch": 4.791666666666667, + "grad_norm": 0.8125811815261841, + "learning_rate": 0.0002, + "loss": 1.3296, + "step": 3680 + }, + { + "epoch": 4.8046875, + "grad_norm": 0.7642565369606018, + "learning_rate": 0.0002, + "loss": 1.3222, + "step": 3690 + }, + { + "epoch": 4.817708333333333, + "grad_norm": 0.8970131874084473, + "learning_rate": 0.0002, + "loss": 1.2842, + "step": 3700 + }, + { + "epoch": 4.830729166666667, + "grad_norm": 0.7654327154159546, + "learning_rate": 0.0002, + "loss": 1.3983, + "step": 3710 + }, + { + "epoch": 4.84375, + "grad_norm": 0.7605378031730652, + "learning_rate": 0.0002, + "loss": 1.3746, + "step": 3720 + }, + { + "epoch": 4.856770833333333, + "grad_norm": 0.8340551257133484, + "learning_rate": 0.0002, + "loss": 1.3149, + "step": 3730 + }, + { + "epoch": 4.869791666666667, + "grad_norm": 0.7273691296577454, + "learning_rate": 0.0002, + "loss": 1.4309, + "step": 3740 + }, + { + "epoch": 4.8828125, + "grad_norm": 0.9718272686004639, + "learning_rate": 0.0002, + "loss": 1.3094, + "step": 3750 + }, + { + "epoch": 4.895833333333333, + "grad_norm": 0.7891847491264343, + "learning_rate": 0.0002, + "loss": 1.296, + "step": 3760 + }, + { + "epoch": 4.908854166666667, + "grad_norm": 0.9090818166732788, + "learning_rate": 0.0002, + "loss": 1.4613, + "step": 3770 + }, + { + "epoch": 4.921875, + "grad_norm": 0.7963318824768066, + "learning_rate": 0.0002, + "loss": 1.3478, + "step": 3780 + }, + { + "epoch": 4.934895833333333, + "grad_norm": 0.7588343620300293, + "learning_rate": 0.0002, + "loss": 1.3558, + "step": 3790 + }, + { + "epoch": 4.947916666666667, + "grad_norm": 0.84076327085495, + "learning_rate": 0.0002, + "loss": 1.3664, + "step": 3800 + }, + { + "epoch": 4.9609375, + "grad_norm": 0.7767227292060852, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 3810 + }, + { + "epoch": 4.973958333333333, + "grad_norm": 0.8101866245269775, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3820 + }, + { + "epoch": 4.986979166666667, + "grad_norm": 0.7808696627616882, + "learning_rate": 0.0002, + "loss": 1.3881, + "step": 3830 + }, + { + "epoch": 5.0, + "grad_norm": 0.9609483480453491, + "learning_rate": 0.0002, + "loss": 1.4475, + "step": 3840 + }, + { + "epoch": 5.0, + "eval_loss": 1.9610719680786133, + "eval_runtime": 87.6572, + "eval_samples_per_second": 5.875, + "eval_steps_per_second": 0.742, + "step": 3840 + }, + { + "epoch": 5.013020833333333, + "grad_norm": 0.9366803765296936, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 3850 + }, + { + "epoch": 5.026041666666667, + "grad_norm": 0.8014302849769592, + "learning_rate": 0.0002, + "loss": 1.1931, + "step": 3860 + }, + { + "epoch": 5.0390625, + "grad_norm": 0.977936863899231, + "learning_rate": 0.0002, + "loss": 1.1418, + "step": 3870 + }, + { + "epoch": 5.052083333333333, + "grad_norm": 1.045047640800476, + "learning_rate": 0.0002, + "loss": 1.1258, + "step": 3880 + }, + { + "epoch": 5.065104166666667, + "grad_norm": 1.125620722770691, + "learning_rate": 0.0002, + "loss": 1.1709, + "step": 3890 + }, + { + "epoch": 5.078125, + "grad_norm": 1.1565124988555908, + "learning_rate": 0.0002, + "loss": 1.1954, + "step": 3900 + }, + { + "epoch": 5.091145833333333, + "grad_norm": 1.102354884147644, + "learning_rate": 0.0002, + "loss": 1.1753, + "step": 3910 + }, + { + "epoch": 5.104166666666667, + "grad_norm": 0.9567629098892212, + "learning_rate": 0.0002, + "loss": 1.1632, + "step": 3920 + }, + { + "epoch": 5.1171875, + "grad_norm": 0.9760252833366394, + "learning_rate": 0.0002, + "loss": 1.1875, + "step": 3930 + }, + { + "epoch": 5.130208333333333, + "grad_norm": 1.026168704032898, + "learning_rate": 0.0002, + "loss": 1.2289, + "step": 3940 + }, + { + "epoch": 5.143229166666667, + "grad_norm": 1.1490436792373657, + "learning_rate": 0.0002, + "loss": 1.1598, + "step": 3950 + }, + { + "epoch": 5.15625, + "grad_norm": 0.9712087512016296, + "learning_rate": 0.0002, + "loss": 1.0823, + "step": 3960 + }, + { + "epoch": 5.169270833333333, + "grad_norm": 1.0095003843307495, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 3970 + }, + { + "epoch": 5.182291666666667, + "grad_norm": 0.9171855449676514, + "learning_rate": 0.0002, + "loss": 1.1617, + "step": 3980 + }, + { + "epoch": 5.1953125, + "grad_norm": 1.0105657577514648, + "learning_rate": 0.0002, + "loss": 1.161, + "step": 3990 + }, + { + "epoch": 5.208333333333333, + "grad_norm": 1.0330145359039307, + "learning_rate": 0.0002, + "loss": 1.2098, + "step": 4000 + }, + { + "epoch": 5.221354166666667, + "grad_norm": 1.0676906108856201, + "learning_rate": 0.0002, + "loss": 1.1965, + "step": 4010 + }, + { + "epoch": 5.234375, + "grad_norm": 1.055088758468628, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 4020 + }, + { + "epoch": 5.247395833333333, + "grad_norm": 0.9523683786392212, + "learning_rate": 0.0002, + "loss": 1.2173, + "step": 4030 + }, + { + "epoch": 5.260416666666667, + "grad_norm": 0.9013799428939819, + "learning_rate": 0.0002, + "loss": 1.1167, + "step": 4040 + }, + { + "epoch": 5.2734375, + "grad_norm": 0.9379037618637085, + "learning_rate": 0.0002, + "loss": 1.2274, + "step": 4050 + }, + { + "epoch": 5.286458333333333, + "grad_norm": 0.9565327763557434, + "learning_rate": 0.0002, + "loss": 1.1246, + "step": 4060 + }, + { + "epoch": 5.299479166666667, + "grad_norm": 1.1994404792785645, + "learning_rate": 0.0002, + "loss": 1.2103, + "step": 4070 + }, + { + "epoch": 5.3125, + "grad_norm": 1.0563262701034546, + "learning_rate": 0.0002, + "loss": 1.2016, + "step": 4080 + }, + { + "epoch": 5.325520833333333, + "grad_norm": 1.024290680885315, + "learning_rate": 0.0002, + "loss": 1.2478, + "step": 4090 + }, + { + "epoch": 5.338541666666667, + "grad_norm": 1.0022907257080078, + "learning_rate": 0.0002, + "loss": 1.2388, + "step": 4100 + }, + { + "epoch": 5.3515625, + "grad_norm": 0.9642180800437927, + "learning_rate": 0.0002, + "loss": 1.1948, + "step": 4110 + }, + { + "epoch": 5.364583333333333, + "grad_norm": 1.0228009223937988, + "learning_rate": 0.0002, + "loss": 1.231, + "step": 4120 + }, + { + "epoch": 5.377604166666667, + "grad_norm": 1.0379719734191895, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4130 + }, + { + "epoch": 5.390625, + "grad_norm": 1.147053599357605, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 4140 + }, + { + "epoch": 5.403645833333333, + "grad_norm": 1.2097876071929932, + "learning_rate": 0.0002, + "loss": 1.2026, + "step": 4150 + }, + { + "epoch": 5.416666666666667, + "grad_norm": 1.0852497816085815, + "learning_rate": 0.0002, + "loss": 1.1978, + "step": 4160 + }, + { + "epoch": 5.4296875, + "grad_norm": 0.9765135645866394, + "learning_rate": 0.0002, + "loss": 1.2182, + "step": 4170 + }, + { + "epoch": 5.442708333333333, + "grad_norm": 1.0180606842041016, + "learning_rate": 0.0002, + "loss": 1.3117, + "step": 4180 + }, + { + "epoch": 5.455729166666667, + "grad_norm": 1.185409665107727, + "learning_rate": 0.0002, + "loss": 1.2355, + "step": 4190 + }, + { + "epoch": 5.46875, + "grad_norm": 0.9363358020782471, + "learning_rate": 0.0002, + "loss": 1.1531, + "step": 4200 + }, + { + "epoch": 5.481770833333333, + "grad_norm": 1.0761215686798096, + "learning_rate": 0.0002, + "loss": 1.1645, + "step": 4210 + }, + { + "epoch": 5.494791666666667, + "grad_norm": 1.057626724243164, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 4220 + }, + { + "epoch": 5.5078125, + "grad_norm": 1.0103157758712769, + "learning_rate": 0.0002, + "loss": 1.2051, + "step": 4230 + }, + { + "epoch": 5.520833333333333, + "grad_norm": 1.1056627035140991, + "learning_rate": 0.0002, + "loss": 1.2193, + "step": 4240 + }, + { + "epoch": 5.533854166666667, + "grad_norm": 1.0256257057189941, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 4250 + }, + { + "epoch": 5.546875, + "grad_norm": 1.2814106941223145, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4260 + }, + { + "epoch": 5.559895833333333, + "grad_norm": 0.9044927954673767, + "learning_rate": 0.0002, + "loss": 1.1676, + "step": 4270 + }, + { + "epoch": 5.572916666666667, + "grad_norm": 0.9870165586471558, + "learning_rate": 0.0002, + "loss": 1.2448, + "step": 4280 + }, + { + "epoch": 5.5859375, + "grad_norm": 0.9867369532585144, + "learning_rate": 0.0002, + "loss": 1.2414, + "step": 4290 + }, + { + "epoch": 5.598958333333333, + "grad_norm": 1.045625925064087, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 4300 + }, + { + "epoch": 5.611979166666667, + "grad_norm": 0.979853630065918, + "learning_rate": 0.0002, + "loss": 1.2786, + "step": 4310 + }, + { + "epoch": 5.625, + "grad_norm": 1.029212236404419, + "learning_rate": 0.0002, + "loss": 1.1629, + "step": 4320 + }, + { + "epoch": 5.638020833333333, + "grad_norm": 1.0348633527755737, + "learning_rate": 0.0002, + "loss": 1.1985, + "step": 4330 + }, + { + "epoch": 5.651041666666667, + "grad_norm": 1.0055185556411743, + "learning_rate": 0.0002, + "loss": 1.1914, + "step": 4340 + }, + { + "epoch": 5.6640625, + "grad_norm": 0.9312447309494019, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 4350 + }, + { + "epoch": 5.677083333333333, + "grad_norm": 1.1411694288253784, + "learning_rate": 0.0002, + "loss": 1.1901, + "step": 4360 + }, + { + "epoch": 5.690104166666667, + "grad_norm": 0.9764434695243835, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 4370 + }, + { + "epoch": 5.703125, + "grad_norm": 1.079154133796692, + "learning_rate": 0.0002, + "loss": 1.2215, + "step": 4380 + }, + { + "epoch": 5.716145833333333, + "grad_norm": 0.999526858329773, + "learning_rate": 0.0002, + "loss": 1.1659, + "step": 4390 + }, + { + "epoch": 5.729166666666667, + "grad_norm": 1.1239734888076782, + "learning_rate": 0.0002, + "loss": 1.1685, + "step": 4400 + }, + { + "epoch": 5.7421875, + "grad_norm": 1.0539512634277344, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 4410 + }, + { + "epoch": 5.755208333333333, + "grad_norm": 0.9884052872657776, + "learning_rate": 0.0002, + "loss": 1.1413, + "step": 4420 + }, + { + "epoch": 5.768229166666667, + "grad_norm": 0.9821958541870117, + "learning_rate": 0.0002, + "loss": 1.1781, + "step": 4430 + }, + { + "epoch": 5.78125, + "grad_norm": 0.9340839982032776, + "learning_rate": 0.0002, + "loss": 1.2319, + "step": 4440 + }, + { + "epoch": 5.794270833333333, + "grad_norm": 0.9935781955718994, + "learning_rate": 0.0002, + "loss": 1.3085, + "step": 4450 + }, + { + "epoch": 5.807291666666667, + "grad_norm": 1.1027121543884277, + "learning_rate": 0.0002, + "loss": 1.1726, + "step": 4460 + }, + { + "epoch": 5.8203125, + "grad_norm": 0.9388337135314941, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 4470 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 1.0957310199737549, + "learning_rate": 0.0002, + "loss": 1.259, + "step": 4480 + }, + { + "epoch": 5.846354166666667, + "grad_norm": 1.0832754373550415, + "learning_rate": 0.0002, + "loss": 1.3017, + "step": 4490 + }, + { + "epoch": 5.859375, + "grad_norm": 0.9498379826545715, + "learning_rate": 0.0002, + "loss": 1.1724, + "step": 4500 + }, + { + "epoch": 5.872395833333333, + "grad_norm": 0.9104725122451782, + "learning_rate": 0.0002, + "loss": 1.2312, + "step": 4510 + }, + { + "epoch": 5.885416666666667, + "grad_norm": 1.2238177061080933, + "learning_rate": 0.0002, + "loss": 1.204, + "step": 4520 + }, + { + "epoch": 5.8984375, + "grad_norm": 1.0549527406692505, + "learning_rate": 0.0002, + "loss": 1.2163, + "step": 4530 + }, + { + "epoch": 5.911458333333333, + "grad_norm": 1.0415066480636597, + "learning_rate": 0.0002, + "loss": 1.3086, + "step": 4540 + }, + { + "epoch": 5.924479166666667, + "grad_norm": 0.9098646640777588, + "learning_rate": 0.0002, + "loss": 1.1744, + "step": 4550 + }, + { + "epoch": 5.9375, + "grad_norm": 0.9182857275009155, + "learning_rate": 0.0002, + "loss": 1.2126, + "step": 4560 + }, + { + "epoch": 5.950520833333333, + "grad_norm": 1.088038444519043, + "learning_rate": 0.0002, + "loss": 1.2341, + "step": 4570 + }, + { + "epoch": 5.963541666666667, + "grad_norm": 1.1331020593643188, + "learning_rate": 0.0002, + "loss": 1.2317, + "step": 4580 + }, + { + "epoch": 5.9765625, + "grad_norm": 0.9592235088348389, + "learning_rate": 0.0002, + "loss": 1.2318, + "step": 4590 + }, + { + "epoch": 5.989583333333333, + "grad_norm": 1.0126368999481201, + "learning_rate": 0.0002, + "loss": 1.1995, + "step": 4600 + }, + { + "epoch": 6.0, + "eval_loss": 2.096651315689087, + "eval_runtime": 43.1936, + "eval_samples_per_second": 11.923, + "eval_steps_per_second": 1.505, + "step": 4608 + }, + { + "epoch": 6.002604166666667, + "grad_norm": 1.0549334287643433, + "learning_rate": 0.0002, + "loss": 1.2061, + "step": 4610 + }, + { + "epoch": 6.015625, + "grad_norm": 1.099247694015503, + "learning_rate": 0.0002, + "loss": 1.0046, + "step": 4620 + }, + { + "epoch": 6.028645833333333, + "grad_norm": 1.0992592573165894, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 4630 + }, + { + "epoch": 6.041666666666667, + "grad_norm": 1.139350414276123, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 4640 + }, + { + "epoch": 6.0546875, + "grad_norm": 1.1316219568252563, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 4650 + }, + { + "epoch": 6.067708333333333, + "grad_norm": 1.5254799127578735, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 4660 + }, + { + "epoch": 6.080729166666667, + "grad_norm": 1.155513048171997, + "learning_rate": 0.0002, + "loss": 1.0357, + "step": 4670 + }, + { + "epoch": 6.09375, + "grad_norm": 1.311339259147644, + "learning_rate": 0.0002, + "loss": 1.0782, + "step": 4680 + }, + { + "epoch": 6.106770833333333, + "grad_norm": 0.9942600131034851, + "learning_rate": 0.0002, + "loss": 1.098, + "step": 4690 + }, + { + "epoch": 6.119791666666667, + "grad_norm": 1.388214111328125, + "learning_rate": 0.0002, + "loss": 0.9989, + "step": 4700 + }, + { + "epoch": 6.1328125, + "grad_norm": 1.260488510131836, + "learning_rate": 0.0002, + "loss": 1.0893, + "step": 4710 + }, + { + "epoch": 6.145833333333333, + "grad_norm": 1.231615662574768, + "learning_rate": 0.0002, + "loss": 1.0225, + "step": 4720 + }, + { + "epoch": 6.158854166666667, + "grad_norm": 1.049696922302246, + "learning_rate": 0.0002, + "loss": 1.0547, + "step": 4730 + }, + { + "epoch": 6.171875, + "grad_norm": 1.145426869392395, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 4740 + }, + { + "epoch": 6.184895833333333, + "grad_norm": 1.1715868711471558, + "learning_rate": 0.0002, + "loss": 1.0751, + "step": 4750 + }, + { + "epoch": 6.197916666666667, + "grad_norm": 1.2575212717056274, + "learning_rate": 0.0002, + "loss": 0.9901, + "step": 4760 + }, + { + "epoch": 6.2109375, + "grad_norm": 1.2996530532836914, + "learning_rate": 0.0002, + "loss": 0.9775, + "step": 4770 + }, + { + "epoch": 6.223958333333333, + "grad_norm": 1.4030718803405762, + "learning_rate": 0.0002, + "loss": 1.0227, + "step": 4780 + }, + { + "epoch": 6.236979166666667, + "grad_norm": 1.2140913009643555, + "learning_rate": 0.0002, + "loss": 1.0439, + "step": 4790 + }, + { + "epoch": 6.25, + "grad_norm": 1.3512893915176392, + "learning_rate": 0.0002, + "loss": 1.0637, + "step": 4800 + }, + { + "epoch": 6.263020833333333, + "grad_norm": 1.1931439638137817, + "learning_rate": 0.0002, + "loss": 1.0367, + "step": 4810 + }, + { + "epoch": 6.276041666666667, + "grad_norm": 1.0379345417022705, + "learning_rate": 0.0002, + "loss": 1.0615, + "step": 4820 + }, + { + "epoch": 6.2890625, + "grad_norm": 1.1571568250656128, + "learning_rate": 0.0002, + "loss": 1.0954, + "step": 4830 + }, + { + "epoch": 6.302083333333333, + "grad_norm": 1.0717264413833618, + "learning_rate": 0.0002, + "loss": 1.0029, + "step": 4840 + }, + { + "epoch": 6.315104166666667, + "grad_norm": 1.360496997833252, + "learning_rate": 0.0002, + "loss": 1.0466, + "step": 4850 + }, + { + "epoch": 6.328125, + "grad_norm": 1.0864052772521973, + "learning_rate": 0.0002, + "loss": 1.001, + "step": 4860 + }, + { + "epoch": 6.341145833333333, + "grad_norm": 1.3391871452331543, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 4870 + }, + { + "epoch": 6.354166666666667, + "grad_norm": 1.2568541765213013, + "learning_rate": 0.0002, + "loss": 1.0797, + "step": 4880 + }, + { + "epoch": 6.3671875, + "grad_norm": 1.255483627319336, + "learning_rate": 0.0002, + "loss": 1.1076, + "step": 4890 + }, + { + "epoch": 6.380208333333333, + "grad_norm": 1.173972487449646, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 4900 + }, + { + "epoch": 6.393229166666667, + "grad_norm": 1.14010488986969, + "learning_rate": 0.0002, + "loss": 1.0238, + "step": 4910 + }, + { + "epoch": 6.40625, + "grad_norm": 1.1317493915557861, + "learning_rate": 0.0002, + "loss": 1.0319, + "step": 4920 + }, + { + "epoch": 6.419270833333333, + "grad_norm": 1.1547486782073975, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 4930 + }, + { + "epoch": 6.432291666666667, + "grad_norm": 1.1822998523712158, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 4940 + }, + { + "epoch": 6.4453125, + "grad_norm": 1.1865756511688232, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 4950 + }, + { + "epoch": 6.458333333333333, + "grad_norm": 1.13661789894104, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 4960 + }, + { + "epoch": 6.471354166666667, + "grad_norm": 1.047326683998108, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 4970 + }, + { + "epoch": 6.484375, + "grad_norm": 1.3550827503204346, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 4980 + }, + { + "epoch": 6.497395833333333, + "grad_norm": 1.2868435382843018, + "learning_rate": 0.0002, + "loss": 1.0984, + "step": 4990 + }, + { + "epoch": 6.510416666666667, + "grad_norm": 1.4678666591644287, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 5000 + }, + { + "epoch": 6.5234375, + "grad_norm": 1.3739159107208252, + "learning_rate": 0.0002, + "loss": 1.076, + "step": 5010 + }, + { + "epoch": 6.536458333333333, + "grad_norm": 1.213034987449646, + "learning_rate": 0.0002, + "loss": 1.046, + "step": 5020 + }, + { + "epoch": 6.549479166666667, + "grad_norm": 1.5025049448013306, + "learning_rate": 0.0002, + "loss": 1.1129, + "step": 5030 + }, + { + "epoch": 6.5625, + "grad_norm": 1.1811821460723877, + "learning_rate": 0.0002, + "loss": 1.0564, + "step": 5040 + }, + { + "epoch": 6.575520833333333, + "grad_norm": 1.2845960855484009, + "learning_rate": 0.0002, + "loss": 1.1096, + "step": 5050 + }, + { + "epoch": 6.588541666666667, + "grad_norm": 1.0641103982925415, + "learning_rate": 0.0002, + "loss": 1.0274, + "step": 5060 + }, + { + "epoch": 6.6015625, + "grad_norm": 1.0967134237289429, + "learning_rate": 0.0002, + "loss": 1.0559, + "step": 5070 + }, + { + "epoch": 6.614583333333333, + "grad_norm": 1.1802116632461548, + "learning_rate": 0.0002, + "loss": 1.0965, + "step": 5080 + }, + { + "epoch": 6.627604166666667, + "grad_norm": 1.3110308647155762, + "learning_rate": 0.0002, + "loss": 1.0296, + "step": 5090 + }, + { + "epoch": 6.640625, + "grad_norm": 1.1863301992416382, + "learning_rate": 0.0002, + "loss": 1.0273, + "step": 5100 + }, + { + "epoch": 6.653645833333333, + "grad_norm": 1.0931109189987183, + "learning_rate": 0.0002, + "loss": 1.1355, + "step": 5110 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.0571614503860474, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 5120 + }, + { + "epoch": 6.6796875, + "grad_norm": 1.2855656147003174, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 5130 + }, + { + "epoch": 6.692708333333333, + "grad_norm": 1.2217806577682495, + "learning_rate": 0.0002, + "loss": 1.0582, + "step": 5140 + }, + { + "epoch": 6.705729166666667, + "grad_norm": 1.093658447265625, + "learning_rate": 0.0002, + "loss": 1.1098, + "step": 5150 + }, + { + "epoch": 6.71875, + "grad_norm": 1.2592076063156128, + "learning_rate": 0.0002, + "loss": 1.0845, + "step": 5160 + }, + { + "epoch": 6.731770833333333, + "grad_norm": 1.0720105171203613, + "learning_rate": 0.0002, + "loss": 1.0381, + "step": 5170 + }, + { + "epoch": 6.744791666666667, + "grad_norm": 1.178058385848999, + "learning_rate": 0.0002, + "loss": 1.0707, + "step": 5180 + }, + { + "epoch": 6.7578125, + "grad_norm": 1.1897447109222412, + "learning_rate": 0.0002, + "loss": 1.116, + "step": 5190 + }, + { + "epoch": 6.770833333333333, + "grad_norm": 1.3547686338424683, + "learning_rate": 0.0002, + "loss": 1.064, + "step": 5200 + }, + { + "epoch": 6.783854166666667, + "grad_norm": 1.2514727115631104, + "learning_rate": 0.0002, + "loss": 1.0642, + "step": 5210 + }, + { + "epoch": 6.796875, + "grad_norm": 1.5253846645355225, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 5220 + }, + { + "epoch": 6.809895833333333, + "grad_norm": 1.090774655342102, + "learning_rate": 0.0002, + "loss": 1.0426, + "step": 5230 + }, + { + "epoch": 6.822916666666667, + "grad_norm": 1.1387991905212402, + "learning_rate": 0.0002, + "loss": 1.0867, + "step": 5240 + }, + { + "epoch": 6.8359375, + "grad_norm": 1.102423906326294, + "learning_rate": 0.0002, + "loss": 1.0493, + "step": 5250 + }, + { + "epoch": 6.848958333333333, + "grad_norm": 1.2453415393829346, + "learning_rate": 0.0002, + "loss": 1.0976, + "step": 5260 + }, + { + "epoch": 6.861979166666667, + "grad_norm": 1.2541141510009766, + "learning_rate": 0.0002, + "loss": 1.1046, + "step": 5270 + }, + { + "epoch": 6.875, + "grad_norm": 1.2719744443893433, + "learning_rate": 0.0002, + "loss": 1.0816, + "step": 5280 + }, + { + "epoch": 6.888020833333333, + "grad_norm": 1.085763931274414, + "learning_rate": 0.0002, + "loss": 1.0399, + "step": 5290 + }, + { + "epoch": 6.901041666666667, + "grad_norm": 1.2399879693984985, + "learning_rate": 0.0002, + "loss": 1.1306, + "step": 5300 + }, + { + "epoch": 6.9140625, + "grad_norm": 1.244888424873352, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 5310 + }, + { + "epoch": 6.927083333333333, + "grad_norm": 1.1424126625061035, + "learning_rate": 0.0002, + "loss": 1.0868, + "step": 5320 + }, + { + "epoch": 6.940104166666667, + "grad_norm": 1.1804956197738647, + "learning_rate": 0.0002, + "loss": 1.0768, + "step": 5330 + }, + { + "epoch": 6.953125, + "grad_norm": 1.3943406343460083, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 5340 + }, + { + "epoch": 6.966145833333333, + "grad_norm": 1.3278584480285645, + "learning_rate": 0.0002, + "loss": 1.0573, + "step": 5350 + }, + { + "epoch": 6.979166666666667, + "grad_norm": 1.3579362630844116, + "learning_rate": 0.0002, + "loss": 1.1008, + "step": 5360 + }, + { + "epoch": 6.9921875, + "grad_norm": 1.2172175645828247, + "learning_rate": 0.0002, + "loss": 1.059, + "step": 5370 + }, + { + "epoch": 7.0, + "eval_loss": 2.200756549835205, + "eval_runtime": 42.8258, + "eval_samples_per_second": 12.025, + "eval_steps_per_second": 1.518, + "step": 5376 + }, + { + "epoch": 7.005208333333333, + "grad_norm": 1.175237774848938, + "learning_rate": 0.0002, + "loss": 1.0272, + "step": 5380 + }, + { + "epoch": 7.018229166666667, + "grad_norm": 1.3215409517288208, + "learning_rate": 0.0002, + "loss": 0.8889, + "step": 5390 + }, + { + "epoch": 7.03125, + "grad_norm": 1.5751091241836548, + "learning_rate": 0.0002, + "loss": 0.8849, + "step": 5400 + }, + { + "epoch": 7.044270833333333, + "grad_norm": 1.390234351158142, + "learning_rate": 0.0002, + "loss": 0.8786, + "step": 5410 + }, + { + "epoch": 7.057291666666667, + "grad_norm": 1.3558553457260132, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5420 + }, + { + "epoch": 7.0703125, + "grad_norm": 1.4664019346237183, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 5430 + }, + { + "epoch": 7.083333333333333, + "grad_norm": 1.5194770097732544, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 5440 + }, + { + "epoch": 7.096354166666667, + "grad_norm": 1.2315709590911865, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 5450 + }, + { + "epoch": 7.109375, + "grad_norm": 1.4849501848220825, + "learning_rate": 0.0002, + "loss": 0.847, + "step": 5460 + }, + { + "epoch": 7.122395833333333, + "grad_norm": 1.471713662147522, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 5470 + }, + { + "epoch": 7.135416666666667, + "grad_norm": 1.5665255784988403, + "learning_rate": 0.0002, + "loss": 0.8852, + "step": 5480 + }, + { + "epoch": 7.1484375, + "grad_norm": 1.796554446220398, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 5490 + }, + { + "epoch": 7.161458333333333, + "grad_norm": 1.3455413579940796, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 5500 + }, + { + "epoch": 7.174479166666667, + "grad_norm": 1.5465866327285767, + "learning_rate": 0.0002, + "loss": 0.8884, + "step": 5510 + }, + { + "epoch": 7.1875, + "grad_norm": 1.42877197265625, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 5520 + }, + { + "epoch": 7.200520833333333, + "grad_norm": 1.5602610111236572, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 5530 + }, + { + "epoch": 7.213541666666667, + "grad_norm": 1.3813670873641968, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 5540 + }, + { + "epoch": 7.2265625, + "grad_norm": 1.2724273204803467, + "learning_rate": 0.0002, + "loss": 0.8852, + "step": 5550 + }, + { + "epoch": 7.239583333333333, + "grad_norm": 1.3184880018234253, + "learning_rate": 0.0002, + "loss": 0.8677, + "step": 5560 + }, + { + "epoch": 7.252604166666667, + "grad_norm": 1.3192334175109863, + "learning_rate": 0.0002, + "loss": 0.9141, + "step": 5570 + }, + { + "epoch": 7.265625, + "grad_norm": 1.6324747800827026, + "learning_rate": 0.0002, + "loss": 0.9027, + "step": 5580 + }, + { + "epoch": 7.278645833333333, + "grad_norm": 1.5259788036346436, + "learning_rate": 0.0002, + "loss": 0.9024, + "step": 5590 + }, + { + "epoch": 7.291666666666667, + "grad_norm": 1.397698163986206, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 5600 + }, + { + "epoch": 7.3046875, + "grad_norm": 1.5079587697982788, + "learning_rate": 0.0002, + "loss": 0.9571, + "step": 5610 + }, + { + "epoch": 7.317708333333333, + "grad_norm": 1.3299126625061035, + "learning_rate": 0.0002, + "loss": 0.9091, + "step": 5620 + }, + { + "epoch": 7.330729166666667, + "grad_norm": 1.4264276027679443, + "learning_rate": 0.0002, + "loss": 0.903, + "step": 5630 + }, + { + "epoch": 7.34375, + "grad_norm": 1.3027597665786743, + "learning_rate": 0.0002, + "loss": 0.8602, + "step": 5640 + }, + { + "epoch": 7.356770833333333, + "grad_norm": 1.3218268156051636, + "learning_rate": 0.0002, + "loss": 0.9054, + "step": 5650 + }, + { + "epoch": 7.369791666666667, + "grad_norm": 1.346595048904419, + "learning_rate": 0.0002, + "loss": 0.986, + "step": 5660 + }, + { + "epoch": 7.3828125, + "grad_norm": 1.5836858749389648, + "learning_rate": 0.0002, + "loss": 0.983, + "step": 5670 + }, + { + "epoch": 7.395833333333333, + "grad_norm": 1.586815595626831, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 5680 + }, + { + "epoch": 7.408854166666667, + "grad_norm": 1.8045003414154053, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 5690 + }, + { + "epoch": 7.421875, + "grad_norm": 1.5418108701705933, + "learning_rate": 0.0002, + "loss": 0.9479, + "step": 5700 + }, + { + "epoch": 7.434895833333333, + "grad_norm": 1.426693081855774, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 5710 + }, + { + "epoch": 7.447916666666667, + "grad_norm": 1.3240386247634888, + "learning_rate": 0.0002, + "loss": 0.8707, + "step": 5720 + }, + { + "epoch": 7.4609375, + "grad_norm": 1.26353919506073, + "learning_rate": 0.0002, + "loss": 0.9101, + "step": 5730 + }, + { + "epoch": 7.473958333333333, + "grad_norm": 1.5816353559494019, + "learning_rate": 0.0002, + "loss": 0.9342, + "step": 5740 + }, + { + "epoch": 7.486979166666667, + "grad_norm": 1.133431315422058, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 5750 + }, + { + "epoch": 7.5, + "grad_norm": 1.3449418544769287, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 5760 + }, + { + "epoch": 7.513020833333333, + "grad_norm": 1.288068175315857, + "learning_rate": 0.0002, + "loss": 0.9176, + "step": 5770 + }, + { + "epoch": 7.526041666666667, + "grad_norm": 1.4578267335891724, + "learning_rate": 0.0002, + "loss": 0.9546, + "step": 5780 + }, + { + "epoch": 7.5390625, + "grad_norm": 1.423254370689392, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 5790 + }, + { + "epoch": 7.552083333333333, + "grad_norm": 1.2016581296920776, + "learning_rate": 0.0002, + "loss": 0.9747, + "step": 5800 + }, + { + "epoch": 7.565104166666667, + "grad_norm": 1.7114553451538086, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 5810 + }, + { + "epoch": 7.578125, + "grad_norm": 1.5403549671173096, + "learning_rate": 0.0002, + "loss": 0.9398, + "step": 5820 + }, + { + "epoch": 7.591145833333333, + "grad_norm": 1.324479341506958, + "learning_rate": 0.0002, + "loss": 0.9186, + "step": 5830 + }, + { + "epoch": 7.604166666666667, + "grad_norm": 1.4195842742919922, + "learning_rate": 0.0002, + "loss": 0.9007, + "step": 5840 + }, + { + "epoch": 7.6171875, + "grad_norm": 1.2824413776397705, + "learning_rate": 0.0002, + "loss": 0.9223, + "step": 5850 + }, + { + "epoch": 7.630208333333333, + "grad_norm": 1.4113891124725342, + "learning_rate": 0.0002, + "loss": 0.9674, + "step": 5860 + }, + { + "epoch": 7.643229166666667, + "grad_norm": 1.425513744354248, + "learning_rate": 0.0002, + "loss": 0.969, + "step": 5870 + }, + { + "epoch": 7.65625, + "grad_norm": 1.369148850440979, + "learning_rate": 0.0002, + "loss": 0.9201, + "step": 5880 + }, + { + "epoch": 7.669270833333333, + "grad_norm": 1.2715039253234863, + "learning_rate": 0.0002, + "loss": 0.9857, + "step": 5890 + }, + { + "epoch": 7.682291666666667, + "grad_norm": 1.5072753429412842, + "learning_rate": 0.0002, + "loss": 0.9278, + "step": 5900 + }, + { + "epoch": 7.6953125, + "grad_norm": 1.2748578786849976, + "learning_rate": 0.0002, + "loss": 0.9552, + "step": 5910 + }, + { + "epoch": 7.708333333333333, + "grad_norm": 1.4645745754241943, + "learning_rate": 0.0002, + "loss": 0.9677, + "step": 5920 + }, + { + "epoch": 7.721354166666667, + "grad_norm": 1.410602331161499, + "learning_rate": 0.0002, + "loss": 0.9259, + "step": 5930 + }, + { + "epoch": 7.734375, + "grad_norm": 1.4340840578079224, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 5940 + }, + { + "epoch": 7.747395833333333, + "grad_norm": 1.4908568859100342, + "learning_rate": 0.0002, + "loss": 0.9063, + "step": 5950 + }, + { + "epoch": 7.760416666666667, + "grad_norm": 1.6938505172729492, + "learning_rate": 0.0002, + "loss": 1.0224, + "step": 5960 + }, + { + "epoch": 7.7734375, + "grad_norm": 1.5617954730987549, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 5970 + }, + { + "epoch": 7.786458333333333, + "grad_norm": 1.4071742296218872, + "learning_rate": 0.0002, + "loss": 0.9556, + "step": 5980 + }, + { + "epoch": 7.799479166666667, + "grad_norm": 1.517405390739441, + "learning_rate": 0.0002, + "loss": 0.9493, + "step": 5990 + }, + { + "epoch": 7.8125, + "grad_norm": 1.4399837255477905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 6000 + }, + { + "epoch": 7.825520833333333, + "grad_norm": 1.4359688758850098, + "learning_rate": 0.0002, + "loss": 1.0497, + "step": 6010 + }, + { + "epoch": 7.838541666666667, + "grad_norm": 1.4709250926971436, + "learning_rate": 0.0002, + "loss": 0.9701, + "step": 6020 + }, + { + "epoch": 7.8515625, + "grad_norm": 1.185585379600525, + "learning_rate": 0.0002, + "loss": 0.9785, + "step": 6030 + }, + { + "epoch": 7.864583333333333, + "grad_norm": 1.3034945726394653, + "learning_rate": 0.0002, + "loss": 0.9425, + "step": 6040 + }, + { + "epoch": 7.877604166666667, + "grad_norm": 1.609330654144287, + "learning_rate": 0.0002, + "loss": 0.9651, + "step": 6050 + }, + { + "epoch": 7.890625, + "grad_norm": 1.416290521621704, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 6060 + }, + { + "epoch": 7.903645833333333, + "grad_norm": 1.58739173412323, + "learning_rate": 0.0002, + "loss": 0.9711, + "step": 6070 + }, + { + "epoch": 7.916666666666667, + "grad_norm": 1.2414129972457886, + "learning_rate": 0.0002, + "loss": 0.9317, + "step": 6080 + }, + { + "epoch": 7.9296875, + "grad_norm": 1.7573872804641724, + "learning_rate": 0.0002, + "loss": 1.0034, + "step": 6090 + }, + { + "epoch": 7.942708333333333, + "grad_norm": 1.514016032218933, + "learning_rate": 0.0002, + "loss": 0.9231, + "step": 6100 + }, + { + "epoch": 7.955729166666667, + "grad_norm": 1.292657494544983, + "learning_rate": 0.0002, + "loss": 0.9818, + "step": 6110 + }, + { + "epoch": 7.96875, + "grad_norm": 1.4027271270751953, + "learning_rate": 0.0002, + "loss": 0.9517, + "step": 6120 + }, + { + "epoch": 7.981770833333333, + "grad_norm": 1.4184486865997314, + "learning_rate": 0.0002, + "loss": 0.9159, + "step": 6130 + }, + { + "epoch": 7.994791666666667, + "grad_norm": 1.5634310245513916, + "learning_rate": 0.0002, + "loss": 0.9724, + "step": 6140 + }, + { + "epoch": 8.0, + "eval_loss": 2.37752628326416, + "eval_runtime": 42.8322, + "eval_samples_per_second": 12.024, + "eval_steps_per_second": 1.518, + "step": 6144 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8433056302681293e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-6144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f7270c87aeb034ac54978b05ecba8508494a8007 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ab0e44349fc9228d86d294b2ff8d37e624d1ac18cd5d1e5abb4a3acf7a1f23 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..421954ea7ae82381055ac9d9389eece4a36c4247 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8ee171d107733df0850802fa7e27b992c38203a2fac1a171d5bbae43bee1e1 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7f59744c433a711c543470bbaa6124cdc61787fd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ae84bfb19400078314651a7552772a34f16b82b97adea6d76304a56593e8f1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..202e27f3fc154523582b255ae2557b85dfcfec2f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f2e47af26ae5a1148c80132b801bbd868a113444ce56774416dedff2c3f0b8 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e933681786d857fe99b6a7e1bd862c4cb19d838 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/trainer_state.json @@ -0,0 +1,573 @@ +{ + "best_metric": 1.8215787410736084, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 768, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013020833333333334, + "grad_norm": 0.513252854347229, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 0.5675475001335144, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 20 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.5074710845947266, + "learning_rate": 0.0002, + "loss": 2.0492, + "step": 30 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 0.7609530687332153, + "learning_rate": 0.0002, + "loss": 2.0109, + "step": 40 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 0.5691684484481812, + "learning_rate": 0.0002, + "loss": 1.8852, + "step": 50 + }, + { + "epoch": 0.078125, + "grad_norm": 0.5346821546554565, + "learning_rate": 0.0002, + "loss": 1.8763, + "step": 60 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 0.46337810158729553, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 70 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 0.4698766767978668, + "learning_rate": 0.0002, + "loss": 1.8124, + "step": 80 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.43780726194381714, + "learning_rate": 0.0002, + "loss": 1.8101, + "step": 90 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 0.9183378219604492, + "learning_rate": 0.0002, + "loss": 1.8044, + "step": 100 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 0.44829392433166504, + "learning_rate": 0.0002, + "loss": 1.9022, + "step": 110 + }, + { + "epoch": 0.15625, + "grad_norm": 0.3734739422798157, + "learning_rate": 0.0002, + "loss": 1.8906, + "step": 120 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 0.4368326663970947, + "learning_rate": 0.0002, + "loss": 1.8302, + "step": 130 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 0.3962480127811432, + "learning_rate": 0.0002, + "loss": 1.898, + "step": 140 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.4569706916809082, + "learning_rate": 0.0002, + "loss": 1.8136, + "step": 150 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.4076327383518219, + "learning_rate": 0.0002, + "loss": 1.8676, + "step": 160 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 0.4026809632778168, + "learning_rate": 0.0002, + "loss": 1.7927, + "step": 170 + }, + { + "epoch": 0.234375, + "grad_norm": 0.40455079078674316, + "learning_rate": 0.0002, + "loss": 1.8999, + "step": 180 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 0.40840157866477966, + "learning_rate": 0.0002, + "loss": 1.8397, + "step": 190 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 0.4101830720901489, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 200 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.3911910057067871, + "learning_rate": 0.0002, + "loss": 1.8106, + "step": 210 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 0.4409257173538208, + "learning_rate": 0.0002, + "loss": 1.8519, + "step": 220 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 0.39020729064941406, + "learning_rate": 0.0002, + "loss": 1.8192, + "step": 230 + }, + { + "epoch": 0.3125, + "grad_norm": 0.4311807155609131, + "learning_rate": 0.0002, + "loss": 1.7586, + "step": 240 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 0.3851333558559418, + "learning_rate": 0.0002, + "loss": 1.7477, + "step": 250 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 0.37738412618637085, + "learning_rate": 0.0002, + "loss": 1.7896, + "step": 260 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.3525104820728302, + "learning_rate": 0.0002, + "loss": 1.783, + "step": 270 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 0.418957382440567, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 280 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 0.40066027641296387, + "learning_rate": 0.0002, + "loss": 1.7989, + "step": 290 + }, + { + "epoch": 0.390625, + "grad_norm": 0.379321813583374, + "learning_rate": 0.0002, + "loss": 1.7294, + "step": 300 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 0.35400667786598206, + "learning_rate": 0.0002, + "loss": 1.869, + "step": 310 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.6621660590171814, + "learning_rate": 0.0002, + "loss": 1.7546, + "step": 320 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.3783826529979706, + "learning_rate": 0.0002, + "loss": 1.8251, + "step": 330 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 0.3920382857322693, + "learning_rate": 0.0002, + "loss": 1.688, + "step": 340 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 0.3657408654689789, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 350 + }, + { + "epoch": 0.46875, + "grad_norm": 0.3717544674873352, + "learning_rate": 0.0002, + "loss": 1.7719, + "step": 360 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 0.33955204486846924, + "learning_rate": 0.0002, + "loss": 1.7863, + "step": 370 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 0.33888939023017883, + "learning_rate": 0.0002, + "loss": 1.7751, + "step": 380 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.3748014271259308, + "learning_rate": 0.0002, + "loss": 1.7366, + "step": 390 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.37372609972953796, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 400 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 0.4089180827140808, + "learning_rate": 0.0002, + "loss": 1.7604, + "step": 410 + }, + { + "epoch": 0.546875, + "grad_norm": 0.38470903038978577, + "learning_rate": 0.0002, + "loss": 1.7767, + "step": 420 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 0.33426186442375183, + "learning_rate": 0.0002, + "loss": 1.814, + "step": 430 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 0.3802422285079956, + "learning_rate": 0.0002, + "loss": 1.6738, + "step": 440 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.3245152533054352, + "learning_rate": 0.0002, + "loss": 1.7983, + "step": 450 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 0.34128233790397644, + "learning_rate": 0.0002, + "loss": 1.7298, + "step": 460 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 0.33154451847076416, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 470 + }, + { + "epoch": 0.625, + "grad_norm": 0.34642690420150757, + "learning_rate": 0.0002, + "loss": 1.7417, + "step": 480 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 0.37599194049835205, + "learning_rate": 0.0002, + "loss": 1.7242, + "step": 490 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 0.4088667333126068, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 500 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.35734823346138, + "learning_rate": 0.0002, + "loss": 1.7216, + "step": 510 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 0.38925203680992126, + "learning_rate": 0.0002, + "loss": 1.8128, + "step": 520 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 0.3787044584751129, + "learning_rate": 0.0002, + "loss": 1.7671, + "step": 530 + }, + { + "epoch": 0.703125, + "grad_norm": 0.35195621848106384, + "learning_rate": 0.0002, + "loss": 1.8375, + "step": 540 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 0.39059996604919434, + "learning_rate": 0.0002, + "loss": 1.7469, + "step": 550 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.5075398683547974, + "learning_rate": 0.0002, + "loss": 1.7351, + "step": 560 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.4286627471446991, + "learning_rate": 0.0002, + "loss": 1.7276, + "step": 570 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 0.33405354619026184, + "learning_rate": 0.0002, + "loss": 1.8418, + "step": 580 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 0.37269648909568787, + "learning_rate": 0.0002, + "loss": 1.7724, + "step": 590 + }, + { + "epoch": 0.78125, + "grad_norm": 0.3618223965167999, + "learning_rate": 0.0002, + "loss": 1.7658, + "step": 600 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 0.33787694573402405, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 610 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 0.4018900990486145, + "learning_rate": 0.0002, + "loss": 1.8033, + "step": 620 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.3892900049686432, + "learning_rate": 0.0002, + "loss": 1.8206, + "step": 630 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.33400827646255493, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 640 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 0.3237822353839874, + "learning_rate": 0.0002, + "loss": 1.7139, + "step": 650 + }, + { + "epoch": 0.859375, + "grad_norm": 0.35551393032073975, + "learning_rate": 0.0002, + "loss": 1.8172, + "step": 660 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 0.38883528113365173, + "learning_rate": 0.0002, + "loss": 1.8265, + "step": 670 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 0.35139647126197815, + "learning_rate": 0.0002, + "loss": 1.7841, + "step": 680 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.3403511941432953, + "learning_rate": 0.0002, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 0.32814469933509827, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 700 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 0.3933236598968506, + "learning_rate": 0.0002, + "loss": 1.7968, + "step": 710 + }, + { + "epoch": 0.9375, + "grad_norm": 0.3436862528324127, + "learning_rate": 0.0002, + "loss": 1.7249, + "step": 720 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 0.32683226466178894, + "learning_rate": 0.0002, + "loss": 1.7717, + "step": 730 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 0.32675468921661377, + "learning_rate": 0.0002, + "loss": 1.7511, + "step": 740 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.371297150850296, + "learning_rate": 0.0002, + "loss": 1.7429, + "step": 750 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 0.39658334851264954, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 760 + }, + { + "epoch": 1.0, + "eval_loss": 1.8215787410736084, + "eval_runtime": 102.4906, + "eval_samples_per_second": 5.025, + "eval_steps_per_second": 0.634, + "step": 768 + } + ], + "logging_steps": 10, + "max_steps": 6144, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.5541320378351616e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f80dde86ba4e618f26a01c223b4deb12abc2573c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e2a27cb20ad8259ead9c902b790583c577f4b154d3f04f1e45e7a3192ebcb +size 5624 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b3e3046ffb02a5038ecb6377daf50de2d1120b0f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 768, "epoch_duration": 2474.8431992530823, "total_accumulated_duration": 2474.8431992530823, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}]} +{"epoch": 2.0, "step": 1536, "epoch_duration": 2470.064612865448, "total_accumulated_duration": 4944.90781211853, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-768", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}]} +{"epoch": 3.0, "step": 2304, "epoch_duration": 2310.4165902137756, "total_accumulated_duration": 7255.324402332306, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}]} +{"epoch": 4.0, "step": 3072, "epoch_duration": 2457.4396193027496, "total_accumulated_duration": 9712.764021635056, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}, {"eval_loss": 1.8474308252334595, "eval_runtime": 103.7697, "eval_samples_per_second": 4.963, "eval_steps_per_second": 0.626, "epoch": 3.0, "step": 2304}, {"loss": 1.5454, "grad_norm": 0.4801871180534363, "learning_rate": 0.0002, "epoch": 3.0078125, "step": 2310}, {"loss": 1.4019, "grad_norm": 0.5789998173713684, "learning_rate": 0.0002, "epoch": 3.0208333333333335, "step": 2320}, {"loss": 1.4419, "grad_norm": 0.49856704473495483, "learning_rate": 0.0002, "epoch": 3.0338541666666665, "step": 2330}, {"loss": 1.4718, "grad_norm": 0.5625631213188171, "learning_rate": 0.0002, "epoch": 3.046875, "step": 2340}, {"loss": 1.4727, "grad_norm": 0.557637095451355, "learning_rate": 0.0002, "epoch": 3.0598958333333335, "step": 2350}, {"loss": 1.4654, "grad_norm": 0.528889536857605, "learning_rate": 0.0002, "epoch": 3.0729166666666665, "step": 2360}, {"loss": 1.4307, "grad_norm": 0.5952284932136536, "learning_rate": 0.0002, "epoch": 3.0859375, "step": 2370}, {"loss": 1.5304, "grad_norm": 0.5549899339675903, "learning_rate": 0.0002, "epoch": 3.0989583333333335, "step": 2380}, {"loss": 1.5034, "grad_norm": 0.662139892578125, "learning_rate": 0.0002, "epoch": 3.1119791666666665, "step": 2390}, {"loss": 1.4754, "grad_norm": 0.5281530618667603, "learning_rate": 0.0002, "epoch": 3.125, "step": 2400}, {"loss": 1.4047, "grad_norm": 0.6134106516838074, "learning_rate": 0.0002, "epoch": 3.1380208333333335, "step": 2410}, {"loss": 1.5001, "grad_norm": 0.6040887236595154, "learning_rate": 0.0002, "epoch": 3.1510416666666665, "step": 2420}, {"loss": 1.3936, "grad_norm": 0.549672544002533, "learning_rate": 0.0002, "epoch": 3.1640625, "step": 2430}, {"loss": 1.401, "grad_norm": 0.9195653796195984, "learning_rate": 0.0002, "epoch": 3.1770833333333335, "step": 2440}, {"loss": 1.507, "grad_norm": 0.5578703284263611, "learning_rate": 0.0002, "epoch": 3.1901041666666665, "step": 2450}, {"loss": 1.4873, "grad_norm": 0.5982925891876221, "learning_rate": 0.0002, "epoch": 3.203125, "step": 2460}, {"loss": 1.4909, "grad_norm": 0.5544393062591553, "learning_rate": 0.0002, "epoch": 3.2161458333333335, "step": 2470}, {"loss": 1.4705, "grad_norm": 0.6015266180038452, "learning_rate": 0.0002, "epoch": 3.2291666666666665, "step": 2480}, {"loss": 1.4652, "grad_norm": 0.5995243191719055, "learning_rate": 0.0002, "epoch": 3.2421875, "step": 2490}, {"loss": 1.4486, "grad_norm": 0.5846129059791565, "learning_rate": 0.0002, "epoch": 3.2552083333333335, "step": 2500}, {"loss": 1.4529, "grad_norm": 0.5552570223808289, "learning_rate": 0.0002, "epoch": 3.2682291666666665, "step": 2510}, {"loss": 1.3884, "grad_norm": 0.576998233795166, "learning_rate": 0.0002, "epoch": 3.28125, "step": 2520}, {"loss": 1.4463, "grad_norm": 0.6526138186454773, "learning_rate": 0.0002, "epoch": 3.2942708333333335, "step": 2530}, {"loss": 1.474, "grad_norm": 0.6064265966415405, "learning_rate": 0.0002, "epoch": 3.3072916666666665, "step": 2540}, {"loss": 1.5125, "grad_norm": 0.5542362928390503, "learning_rate": 0.0002, "epoch": 3.3203125, "step": 2550}, {"loss": 1.4769, "grad_norm": 0.6048482060432434, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2560}, {"loss": 1.4682, "grad_norm": 0.6328344941139221, "learning_rate": 0.0002, "epoch": 3.3463541666666665, "step": 2570}, {"loss": 1.5647, "grad_norm": 0.6347311735153198, "learning_rate": 0.0002, "epoch": 3.359375, "step": 2580}, {"loss": 1.5752, "grad_norm": 0.537570595741272, "learning_rate": 0.0002, "epoch": 3.3723958333333335, "step": 2590}, {"loss": 1.4086, "grad_norm": 0.5704807639122009, "learning_rate": 0.0002, "epoch": 3.3854166666666665, "step": 2600}, {"loss": 1.5653, "grad_norm": 0.5914373993873596, "learning_rate": 0.0002, "epoch": 3.3984375, "step": 2610}, {"loss": 1.4436, "grad_norm": 0.6724640130996704, "learning_rate": 0.0002, "epoch": 3.4114583333333335, "step": 2620}, {"loss": 1.5731, "grad_norm": 0.6295472383499146, "learning_rate": 0.0002, "epoch": 3.4244791666666665, "step": 2630}, {"loss": 1.4715, "grad_norm": 0.5842770934104919, "learning_rate": 0.0002, "epoch": 3.4375, "step": 2640}, {"loss": 1.451, "grad_norm": 0.6297776699066162, "learning_rate": 0.0002, "epoch": 3.4505208333333335, "step": 2650}, {"loss": 1.5761, "grad_norm": 0.6105847358703613, "learning_rate": 0.0002, "epoch": 3.4635416666666665, "step": 2660}, {"loss": 1.5332, "grad_norm": 0.6294940710067749, "learning_rate": 0.0002, "epoch": 3.4765625, "step": 2670}, {"loss": 1.5451, "grad_norm": 0.6573333740234375, "learning_rate": 0.0002, "epoch": 3.4895833333333335, "step": 2680}, {"loss": 1.4592, "grad_norm": 0.663661539554596, "learning_rate": 0.0002, "epoch": 3.5026041666666665, "step": 2690}, {"loss": 1.5286, "grad_norm": 0.6729148626327515, "learning_rate": 0.0002, "epoch": 3.515625, "step": 2700}, {"loss": 1.534, "grad_norm": 0.6633102893829346, "learning_rate": 0.0002, "epoch": 3.5286458333333335, "step": 2710}, {"loss": 1.4023, "grad_norm": 0.567686915397644, "learning_rate": 0.0002, "epoch": 3.5416666666666665, "step": 2720}, {"loss": 1.4925, "grad_norm": 0.6281962394714355, "learning_rate": 0.0002, "epoch": 3.5546875, "step": 2730}, {"loss": 1.5028, "grad_norm": 0.5710738897323608, "learning_rate": 0.0002, "epoch": 3.5677083333333335, "step": 2740}, {"loss": 1.4393, "grad_norm": 0.648162305355072, "learning_rate": 0.0002, "epoch": 3.5807291666666665, "step": 2750}, {"loss": 1.4294, "grad_norm": 0.5466254949569702, "learning_rate": 0.0002, "epoch": 3.59375, "step": 2760}, {"loss": 1.4993, "grad_norm": 0.6867973208427429, "learning_rate": 0.0002, "epoch": 3.6067708333333335, "step": 2770}, {"loss": 1.4463, "grad_norm": 0.673612117767334, "learning_rate": 0.0002, "epoch": 3.6197916666666665, "step": 2780}, {"loss": 1.5231, "grad_norm": 0.6928417086601257, "learning_rate": 0.0002, "epoch": 3.6328125, "step": 2790}, {"loss": 1.5212, "grad_norm": 0.6603742837905884, "learning_rate": 0.0002, "epoch": 3.6458333333333335, "step": 2800}, {"loss": 1.4889, "grad_norm": 0.5964401960372925, "learning_rate": 0.0002, "epoch": 3.6588541666666665, "step": 2810}, {"loss": 1.4585, "grad_norm": 0.6224474310874939, "learning_rate": 0.0002, "epoch": 3.671875, "step": 2820}, {"loss": 1.5119, "grad_norm": 0.6592439413070679, "learning_rate": 0.0002, "epoch": 3.6848958333333335, "step": 2830}, {"loss": 1.4729, "grad_norm": 0.6255369186401367, "learning_rate": 0.0002, "epoch": 3.6979166666666665, "step": 2840}, {"loss": 1.4598, "grad_norm": 0.7136337757110596, "learning_rate": 0.0002, "epoch": 3.7109375, "step": 2850}, {"loss": 1.4491, "grad_norm": 0.6229757070541382, "learning_rate": 0.0002, "epoch": 3.7239583333333335, "step": 2860}, {"loss": 1.4175, "grad_norm": 0.696080207824707, "learning_rate": 0.0002, "epoch": 3.7369791666666665, "step": 2870}, {"loss": 1.5127, "grad_norm": 0.571873664855957, "learning_rate": 0.0002, "epoch": 3.75, "step": 2880}, {"loss": 1.4093, "grad_norm": 0.5918916463851929, "learning_rate": 0.0002, "epoch": 3.7630208333333335, "step": 2890}, {"loss": 1.399, "grad_norm": 0.616413950920105, "learning_rate": 0.0002, "epoch": 3.7760416666666665, "step": 2900}, {"loss": 1.4215, "grad_norm": 0.6267292499542236, "learning_rate": 0.0002, "epoch": 3.7890625, "step": 2910}, {"loss": 1.5095, "grad_norm": 0.6630783677101135, "learning_rate": 0.0002, "epoch": 3.8020833333333335, "step": 2920}, {"loss": 1.5323, "grad_norm": 0.6004238724708557, "learning_rate": 0.0002, "epoch": 3.8151041666666665, "step": 2930}, {"loss": 1.4953, "grad_norm": 0.6740423440933228, "learning_rate": 0.0002, "epoch": 3.828125, "step": 2940}, {"loss": 1.549, "grad_norm": 0.6397785544395447, "learning_rate": 0.0002, "epoch": 3.8411458333333335, "step": 2950}, {"loss": 1.5309, "grad_norm": 0.6063735485076904, "learning_rate": 0.0002, "epoch": 3.8541666666666665, "step": 2960}, {"loss": 1.5093, "grad_norm": 0.6462053060531616, "learning_rate": 0.0002, "epoch": 3.8671875, "step": 2970}, {"loss": 1.5237, "grad_norm": 0.7143250107765198, "learning_rate": 0.0002, "epoch": 3.8802083333333335, "step": 2980}, {"loss": 1.4419, "grad_norm": 0.6747874617576599, "learning_rate": 0.0002, "epoch": 3.8932291666666665, "step": 2990}, {"loss": 1.5389, "grad_norm": 0.622930109500885, "learning_rate": 0.0002, "epoch": 3.90625, "step": 3000}, {"loss": 1.4279, "grad_norm": 0.620193600654602, "learning_rate": 0.0002, "epoch": 3.9192708333333335, "step": 3010}, {"loss": 1.495, "grad_norm": 0.6321487426757812, "learning_rate": 0.0002, "epoch": 3.9322916666666665, "step": 3020}, {"loss": 1.4657, "grad_norm": 0.5705523490905762, "learning_rate": 0.0002, "epoch": 3.9453125, "step": 3030}, {"loss": 1.4099, "grad_norm": 0.6185072660446167, "learning_rate": 0.0002, "epoch": 3.9583333333333335, "step": 3040}, {"loss": 1.4667, "grad_norm": 0.6005704998970032, "learning_rate": 0.0002, "epoch": 3.9713541666666665, "step": 3050}, {"loss": 1.4896, "grad_norm": 0.5933769941329956, "learning_rate": 0.0002, "epoch": 3.984375, "step": 3060}, {"loss": 1.4973, "grad_norm": 0.695209801197052, "learning_rate": 0.0002, "epoch": 3.9973958333333335, "step": 3070}]} +{"epoch": 5.0, "step": 3840, "epoch_duration": 2410.235536336899, "total_accumulated_duration": 12122.999557971954, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}, {"eval_loss": 1.8474308252334595, "eval_runtime": 103.7697, "eval_samples_per_second": 4.963, "eval_steps_per_second": 0.626, "epoch": 3.0, "step": 2304}, {"loss": 1.5454, "grad_norm": 0.4801871180534363, "learning_rate": 0.0002, "epoch": 3.0078125, "step": 2310}, {"loss": 1.4019, "grad_norm": 0.5789998173713684, "learning_rate": 0.0002, "epoch": 3.0208333333333335, "step": 2320}, {"loss": 1.4419, "grad_norm": 0.49856704473495483, "learning_rate": 0.0002, "epoch": 3.0338541666666665, "step": 2330}, {"loss": 1.4718, "grad_norm": 0.5625631213188171, "learning_rate": 0.0002, "epoch": 3.046875, "step": 2340}, {"loss": 1.4727, "grad_norm": 0.557637095451355, "learning_rate": 0.0002, "epoch": 3.0598958333333335, "step": 2350}, {"loss": 1.4654, "grad_norm": 0.528889536857605, "learning_rate": 0.0002, "epoch": 3.0729166666666665, "step": 2360}, {"loss": 1.4307, "grad_norm": 0.5952284932136536, "learning_rate": 0.0002, "epoch": 3.0859375, "step": 2370}, {"loss": 1.5304, "grad_norm": 0.5549899339675903, "learning_rate": 0.0002, "epoch": 3.0989583333333335, "step": 2380}, {"loss": 1.5034, "grad_norm": 0.662139892578125, "learning_rate": 0.0002, "epoch": 3.1119791666666665, "step": 2390}, {"loss": 1.4754, "grad_norm": 0.5281530618667603, "learning_rate": 0.0002, "epoch": 3.125, "step": 2400}, {"loss": 1.4047, "grad_norm": 0.6134106516838074, "learning_rate": 0.0002, "epoch": 3.1380208333333335, "step": 2410}, {"loss": 1.5001, "grad_norm": 0.6040887236595154, "learning_rate": 0.0002, "epoch": 3.1510416666666665, "step": 2420}, {"loss": 1.3936, "grad_norm": 0.549672544002533, "learning_rate": 0.0002, "epoch": 3.1640625, "step": 2430}, {"loss": 1.401, "grad_norm": 0.9195653796195984, "learning_rate": 0.0002, "epoch": 3.1770833333333335, "step": 2440}, {"loss": 1.507, "grad_norm": 0.5578703284263611, "learning_rate": 0.0002, "epoch": 3.1901041666666665, "step": 2450}, {"loss": 1.4873, "grad_norm": 0.5982925891876221, "learning_rate": 0.0002, "epoch": 3.203125, "step": 2460}, {"loss": 1.4909, "grad_norm": 0.5544393062591553, "learning_rate": 0.0002, "epoch": 3.2161458333333335, "step": 2470}, {"loss": 1.4705, "grad_norm": 0.6015266180038452, "learning_rate": 0.0002, "epoch": 3.2291666666666665, "step": 2480}, {"loss": 1.4652, "grad_norm": 0.5995243191719055, "learning_rate": 0.0002, "epoch": 3.2421875, "step": 2490}, {"loss": 1.4486, "grad_norm": 0.5846129059791565, "learning_rate": 0.0002, "epoch": 3.2552083333333335, "step": 2500}, {"loss": 1.4529, "grad_norm": 0.5552570223808289, "learning_rate": 0.0002, "epoch": 3.2682291666666665, "step": 2510}, {"loss": 1.3884, "grad_norm": 0.576998233795166, "learning_rate": 0.0002, "epoch": 3.28125, "step": 2520}, {"loss": 1.4463, "grad_norm": 0.6526138186454773, "learning_rate": 0.0002, "epoch": 3.2942708333333335, "step": 2530}, {"loss": 1.474, "grad_norm": 0.6064265966415405, "learning_rate": 0.0002, "epoch": 3.3072916666666665, "step": 2540}, {"loss": 1.5125, "grad_norm": 0.5542362928390503, "learning_rate": 0.0002, "epoch": 3.3203125, "step": 2550}, {"loss": 1.4769, "grad_norm": 0.6048482060432434, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2560}, {"loss": 1.4682, "grad_norm": 0.6328344941139221, "learning_rate": 0.0002, "epoch": 3.3463541666666665, "step": 2570}, {"loss": 1.5647, "grad_norm": 0.6347311735153198, "learning_rate": 0.0002, "epoch": 3.359375, "step": 2580}, {"loss": 1.5752, "grad_norm": 0.537570595741272, "learning_rate": 0.0002, "epoch": 3.3723958333333335, "step": 2590}, {"loss": 1.4086, "grad_norm": 0.5704807639122009, "learning_rate": 0.0002, "epoch": 3.3854166666666665, "step": 2600}, {"loss": 1.5653, "grad_norm": 0.5914373993873596, "learning_rate": 0.0002, "epoch": 3.3984375, "step": 2610}, {"loss": 1.4436, "grad_norm": 0.6724640130996704, "learning_rate": 0.0002, "epoch": 3.4114583333333335, "step": 2620}, {"loss": 1.5731, "grad_norm": 0.6295472383499146, "learning_rate": 0.0002, "epoch": 3.4244791666666665, "step": 2630}, {"loss": 1.4715, "grad_norm": 0.5842770934104919, "learning_rate": 0.0002, "epoch": 3.4375, "step": 2640}, {"loss": 1.451, "grad_norm": 0.6297776699066162, "learning_rate": 0.0002, "epoch": 3.4505208333333335, "step": 2650}, {"loss": 1.5761, "grad_norm": 0.6105847358703613, "learning_rate": 0.0002, "epoch": 3.4635416666666665, "step": 2660}, {"loss": 1.5332, "grad_norm": 0.6294940710067749, "learning_rate": 0.0002, "epoch": 3.4765625, "step": 2670}, {"loss": 1.5451, "grad_norm": 0.6573333740234375, "learning_rate": 0.0002, "epoch": 3.4895833333333335, "step": 2680}, {"loss": 1.4592, "grad_norm": 0.663661539554596, "learning_rate": 0.0002, "epoch": 3.5026041666666665, "step": 2690}, {"loss": 1.5286, "grad_norm": 0.6729148626327515, "learning_rate": 0.0002, "epoch": 3.515625, "step": 2700}, {"loss": 1.534, "grad_norm": 0.6633102893829346, "learning_rate": 0.0002, "epoch": 3.5286458333333335, "step": 2710}, {"loss": 1.4023, "grad_norm": 0.567686915397644, "learning_rate": 0.0002, "epoch": 3.5416666666666665, "step": 2720}, {"loss": 1.4925, "grad_norm": 0.6281962394714355, "learning_rate": 0.0002, "epoch": 3.5546875, "step": 2730}, {"loss": 1.5028, "grad_norm": 0.5710738897323608, "learning_rate": 0.0002, "epoch": 3.5677083333333335, "step": 2740}, {"loss": 1.4393, "grad_norm": 0.648162305355072, "learning_rate": 0.0002, "epoch": 3.5807291666666665, "step": 2750}, {"loss": 1.4294, "grad_norm": 0.5466254949569702, "learning_rate": 0.0002, "epoch": 3.59375, "step": 2760}, {"loss": 1.4993, "grad_norm": 0.6867973208427429, "learning_rate": 0.0002, "epoch": 3.6067708333333335, "step": 2770}, {"loss": 1.4463, "grad_norm": 0.673612117767334, "learning_rate": 0.0002, "epoch": 3.6197916666666665, "step": 2780}, {"loss": 1.5231, "grad_norm": 0.6928417086601257, "learning_rate": 0.0002, "epoch": 3.6328125, "step": 2790}, {"loss": 1.5212, "grad_norm": 0.6603742837905884, "learning_rate": 0.0002, "epoch": 3.6458333333333335, "step": 2800}, {"loss": 1.4889, "grad_norm": 0.5964401960372925, "learning_rate": 0.0002, "epoch": 3.6588541666666665, "step": 2810}, {"loss": 1.4585, "grad_norm": 0.6224474310874939, "learning_rate": 0.0002, "epoch": 3.671875, "step": 2820}, {"loss": 1.5119, "grad_norm": 0.6592439413070679, "learning_rate": 0.0002, "epoch": 3.6848958333333335, "step": 2830}, {"loss": 1.4729, "grad_norm": 0.6255369186401367, "learning_rate": 0.0002, "epoch": 3.6979166666666665, "step": 2840}, {"loss": 1.4598, "grad_norm": 0.7136337757110596, "learning_rate": 0.0002, "epoch": 3.7109375, "step": 2850}, {"loss": 1.4491, "grad_norm": 0.6229757070541382, "learning_rate": 0.0002, "epoch": 3.7239583333333335, "step": 2860}, {"loss": 1.4175, "grad_norm": 0.696080207824707, "learning_rate": 0.0002, "epoch": 3.7369791666666665, "step": 2870}, {"loss": 1.5127, "grad_norm": 0.571873664855957, "learning_rate": 0.0002, "epoch": 3.75, "step": 2880}, {"loss": 1.4093, "grad_norm": 0.5918916463851929, "learning_rate": 0.0002, "epoch": 3.7630208333333335, "step": 2890}, {"loss": 1.399, "grad_norm": 0.616413950920105, "learning_rate": 0.0002, "epoch": 3.7760416666666665, "step": 2900}, {"loss": 1.4215, "grad_norm": 0.6267292499542236, "learning_rate": 0.0002, "epoch": 3.7890625, "step": 2910}, {"loss": 1.5095, "grad_norm": 0.6630783677101135, "learning_rate": 0.0002, "epoch": 3.8020833333333335, "step": 2920}, {"loss": 1.5323, "grad_norm": 0.6004238724708557, "learning_rate": 0.0002, "epoch": 3.8151041666666665, "step": 2930}, {"loss": 1.4953, "grad_norm": 0.6740423440933228, "learning_rate": 0.0002, "epoch": 3.828125, "step": 2940}, {"loss": 1.549, "grad_norm": 0.6397785544395447, "learning_rate": 0.0002, "epoch": 3.8411458333333335, "step": 2950}, {"loss": 1.5309, "grad_norm": 0.6063735485076904, "learning_rate": 0.0002, "epoch": 3.8541666666666665, "step": 2960}, {"loss": 1.5093, "grad_norm": 0.6462053060531616, "learning_rate": 0.0002, "epoch": 3.8671875, "step": 2970}, {"loss": 1.5237, "grad_norm": 0.7143250107765198, "learning_rate": 0.0002, "epoch": 3.8802083333333335, "step": 2980}, {"loss": 1.4419, "grad_norm": 0.6747874617576599, "learning_rate": 0.0002, "epoch": 3.8932291666666665, "step": 2990}, {"loss": 1.5389, "grad_norm": 0.622930109500885, "learning_rate": 0.0002, "epoch": 3.90625, "step": 3000}, {"loss": 1.4279, "grad_norm": 0.620193600654602, "learning_rate": 0.0002, "epoch": 3.9192708333333335, "step": 3010}, {"loss": 1.495, "grad_norm": 0.6321487426757812, "learning_rate": 0.0002, "epoch": 3.9322916666666665, "step": 3020}, {"loss": 1.4657, "grad_norm": 0.5705523490905762, "learning_rate": 0.0002, "epoch": 3.9453125, "step": 3030}, {"loss": 1.4099, "grad_norm": 0.6185072660446167, "learning_rate": 0.0002, "epoch": 3.9583333333333335, "step": 3040}, {"loss": 1.4667, "grad_norm": 0.6005704998970032, "learning_rate": 0.0002, "epoch": 3.9713541666666665, "step": 3050}, {"loss": 1.4896, "grad_norm": 0.5933769941329956, "learning_rate": 0.0002, "epoch": 3.984375, "step": 3060}, {"loss": 1.4973, "grad_norm": 0.695209801197052, "learning_rate": 0.0002, "epoch": 3.9973958333333335, "step": 3070}, {"eval_loss": 1.8955267667770386, "eval_runtime": 103.5061, "eval_samples_per_second": 4.976, "eval_steps_per_second": 0.628, "epoch": 4.0, "step": 3072}, {"loss": 1.3502, "grad_norm": 0.6706188321113586, "learning_rate": 0.0002, "epoch": 4.010416666666667, "step": 3080}, {"loss": 1.2917, "grad_norm": 0.7263980507850647, "learning_rate": 0.0002, "epoch": 4.0234375, "step": 3090}, {"loss": 1.2845, "grad_norm": 0.7767240405082703, "learning_rate": 0.0002, "epoch": 4.036458333333333, "step": 3100}, {"loss": 1.4169, "grad_norm": 0.6888399124145508, "learning_rate": 0.0002, "epoch": 4.049479166666667, "step": 3110}, {"loss": 1.2422, "grad_norm": 0.8860331773757935, "learning_rate": 0.0002, "epoch": 4.0625, "step": 3120}, {"loss": 1.2842, "grad_norm": 0.7572373151779175, "learning_rate": 0.0002, "epoch": 4.075520833333333, "step": 3130}, {"loss": 1.2747, "grad_norm": 0.8321536183357239, "learning_rate": 0.0002, "epoch": 4.088541666666667, "step": 3140}, {"loss": 1.2843, "grad_norm": 0.7042664885520935, "learning_rate": 0.0002, "epoch": 4.1015625, "step": 3150}, {"loss": 1.3326, "grad_norm": 0.8910216689109802, "learning_rate": 0.0002, "epoch": 4.114583333333333, "step": 3160}, {"loss": 1.2742, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 4.127604166666667, "step": 3170}, {"loss": 1.2985, "grad_norm": 0.7120883464813232, "learning_rate": 0.0002, "epoch": 4.140625, "step": 3180}, {"loss": 1.3611, "grad_norm": 0.6904631853103638, "learning_rate": 0.0002, "epoch": 4.153645833333333, "step": 3190}, {"loss": 1.2881, "grad_norm": 0.6398878693580627, "learning_rate": 0.0002, "epoch": 4.166666666666667, "step": 3200}, {"loss": 1.3323, "grad_norm": 0.7573692798614502, "learning_rate": 0.0002, "epoch": 4.1796875, "step": 3210}, {"loss": 1.3509, "grad_norm": 0.7850743532180786, "learning_rate": 0.0002, "epoch": 4.192708333333333, "step": 3220}, {"loss": 1.3176, "grad_norm": 0.7863165736198425, "learning_rate": 0.0002, "epoch": 4.205729166666667, "step": 3230}, {"loss": 1.3739, "grad_norm": 0.7855865359306335, "learning_rate": 0.0002, "epoch": 4.21875, "step": 3240}, {"loss": 1.3251, "grad_norm": 0.6840922832489014, "learning_rate": 0.0002, "epoch": 4.231770833333333, "step": 3250}, {"loss": 1.32, "grad_norm": 0.8499747514724731, "learning_rate": 0.0002, "epoch": 4.244791666666667, "step": 3260}, {"loss": 1.4045, "grad_norm": 0.7982883453369141, "learning_rate": 0.0002, "epoch": 4.2578125, "step": 3270}, {"loss": 1.3922, "grad_norm": 0.7776934504508972, "learning_rate": 0.0002, "epoch": 4.270833333333333, "step": 3280}, {"loss": 1.309, "grad_norm": 0.8887693881988525, "learning_rate": 0.0002, "epoch": 4.283854166666667, "step": 3290}, {"loss": 1.3213, "grad_norm": 1.0184714794158936, "learning_rate": 0.0002, "epoch": 4.296875, "step": 3300}, {"loss": 1.3212, "grad_norm": 0.7539387345314026, "learning_rate": 0.0002, "epoch": 4.309895833333333, "step": 3310}, {"loss": 1.3403, "grad_norm": 0.8137491345405579, "learning_rate": 0.0002, "epoch": 4.322916666666667, "step": 3320}, {"loss": 1.3069, "grad_norm": 0.8136276006698608, "learning_rate": 0.0002, "epoch": 4.3359375, "step": 3330}, {"loss": 1.3512, "grad_norm": 0.7880964279174805, "learning_rate": 0.0002, "epoch": 4.348958333333333, "step": 3340}, {"loss": 1.3468, "grad_norm": 0.8654456734657288, "learning_rate": 0.0002, "epoch": 4.361979166666667, "step": 3350}, {"loss": 1.3036, "grad_norm": 0.8093366622924805, "learning_rate": 0.0002, "epoch": 4.375, "step": 3360}, {"loss": 1.3826, "grad_norm": 0.8738575577735901, "learning_rate": 0.0002, "epoch": 4.388020833333333, "step": 3370}, {"loss": 1.3485, "grad_norm": 0.8923026919364929, "learning_rate": 0.0002, "epoch": 4.401041666666667, "step": 3380}, {"loss": 1.3628, "grad_norm": 0.8508910536766052, "learning_rate": 0.0002, "epoch": 4.4140625, "step": 3390}, {"loss": 1.3048, "grad_norm": 0.8262084722518921, "learning_rate": 0.0002, "epoch": 4.427083333333333, "step": 3400}, {"loss": 1.3145, "grad_norm": 0.7843561768531799, "learning_rate": 0.0002, "epoch": 4.440104166666667, "step": 3410}, {"loss": 1.4526, "grad_norm": 0.9087795615196228, "learning_rate": 0.0002, "epoch": 4.453125, "step": 3420}, {"loss": 1.3492, "grad_norm": 0.8278809189796448, "learning_rate": 0.0002, "epoch": 4.466145833333333, "step": 3430}, {"loss": 1.3797, "grad_norm": 0.8337010741233826, "learning_rate": 0.0002, "epoch": 4.479166666666667, "step": 3440}, {"loss": 1.3199, "grad_norm": 0.7790088057518005, "learning_rate": 0.0002, "epoch": 4.4921875, "step": 3450}, {"loss": 1.3344, "grad_norm": 0.826231837272644, "learning_rate": 0.0002, "epoch": 4.505208333333333, "step": 3460}, {"loss": 1.3915, "grad_norm": 0.761461079120636, "learning_rate": 0.0002, "epoch": 4.518229166666667, "step": 3470}, {"loss": 1.2829, "grad_norm": 0.8892785906791687, "learning_rate": 0.0002, "epoch": 4.53125, "step": 3480}, {"loss": 1.3571, "grad_norm": 0.6087225675582886, "learning_rate": 0.0002, "epoch": 4.544270833333333, "step": 3490}, {"loss": 1.3167, "grad_norm": 0.8259274363517761, "learning_rate": 0.0002, "epoch": 4.557291666666667, "step": 3500}, {"loss": 1.3664, "grad_norm": 0.821164071559906, "learning_rate": 0.0002, "epoch": 4.5703125, "step": 3510}, {"loss": 1.2853, "grad_norm": 0.7262887954711914, "learning_rate": 0.0002, "epoch": 4.583333333333333, "step": 3520}, {"loss": 1.3777, "grad_norm": 0.8564826250076294, "learning_rate": 0.0002, "epoch": 4.596354166666667, "step": 3530}, {"loss": 1.3238, "grad_norm": 0.8072929978370667, "learning_rate": 0.0002, "epoch": 4.609375, "step": 3540}, {"loss": 1.43, "grad_norm": 0.8040832877159119, "learning_rate": 0.0002, "epoch": 4.622395833333333, "step": 3550}, {"loss": 1.2863, "grad_norm": 0.7268754839897156, "learning_rate": 0.0002, "epoch": 4.635416666666667, "step": 3560}, {"loss": 1.3485, "grad_norm": 0.9985134601593018, "learning_rate": 0.0002, "epoch": 4.6484375, "step": 3570}, {"loss": 1.3221, "grad_norm": 0.9826098680496216, "learning_rate": 0.0002, "epoch": 4.661458333333333, "step": 3580}, {"loss": 1.2878, "grad_norm": 0.8794422149658203, "learning_rate": 0.0002, "epoch": 4.674479166666667, "step": 3590}, {"loss": 1.3674, "grad_norm": 0.7207489609718323, "learning_rate": 0.0002, "epoch": 4.6875, "step": 3600}, {"loss": 1.3192, "grad_norm": 0.7546059489250183, "learning_rate": 0.0002, "epoch": 4.700520833333333, "step": 3610}, {"loss": 1.3445, "grad_norm": 0.8318526148796082, "learning_rate": 0.0002, "epoch": 4.713541666666667, "step": 3620}, {"loss": 1.3847, "grad_norm": 0.7529309391975403, "learning_rate": 0.0002, "epoch": 4.7265625, "step": 3630}, {"loss": 1.4208, "grad_norm": 0.7762532234191895, "learning_rate": 0.0002, "epoch": 4.739583333333333, "step": 3640}, {"loss": 1.4162, "grad_norm": 0.9306083917617798, "learning_rate": 0.0002, "epoch": 4.752604166666667, "step": 3650}, {"loss": 1.3828, "grad_norm": 0.8050256967544556, "learning_rate": 0.0002, "epoch": 4.765625, "step": 3660}, {"loss": 1.3671, "grad_norm": 0.8114449381828308, "learning_rate": 0.0002, "epoch": 4.778645833333333, "step": 3670}, {"loss": 1.3296, "grad_norm": 0.8125811815261841, "learning_rate": 0.0002, "epoch": 4.791666666666667, "step": 3680}, {"loss": 1.3222, "grad_norm": 0.7642565369606018, "learning_rate": 0.0002, "epoch": 4.8046875, "step": 3690}, {"loss": 1.2842, "grad_norm": 0.8970131874084473, "learning_rate": 0.0002, "epoch": 4.817708333333333, "step": 3700}, {"loss": 1.3983, "grad_norm": 0.7654327154159546, "learning_rate": 0.0002, "epoch": 4.830729166666667, "step": 3710}, {"loss": 1.3746, "grad_norm": 0.7605378031730652, "learning_rate": 0.0002, "epoch": 4.84375, "step": 3720}, {"loss": 1.3149, "grad_norm": 0.8340551257133484, "learning_rate": 0.0002, "epoch": 4.856770833333333, "step": 3730}, {"loss": 1.4309, "grad_norm": 0.7273691296577454, "learning_rate": 0.0002, "epoch": 4.869791666666667, "step": 3740}, {"loss": 1.3094, "grad_norm": 0.9718272686004639, "learning_rate": 0.0002, "epoch": 4.8828125, "step": 3750}, {"loss": 1.296, "grad_norm": 0.7891847491264343, "learning_rate": 0.0002, "epoch": 4.895833333333333, "step": 3760}, {"loss": 1.4613, "grad_norm": 0.9090818166732788, "learning_rate": 0.0002, "epoch": 4.908854166666667, "step": 3770}, {"loss": 1.3478, "grad_norm": 0.7963318824768066, "learning_rate": 0.0002, "epoch": 4.921875, "step": 3780}, {"loss": 1.3558, "grad_norm": 0.7588343620300293, "learning_rate": 0.0002, "epoch": 4.934895833333333, "step": 3790}, {"loss": 1.3664, "grad_norm": 0.84076327085495, "learning_rate": 0.0002, "epoch": 4.947916666666667, "step": 3800}, {"loss": 1.2836, "grad_norm": 0.7767227292060852, "learning_rate": 0.0002, "epoch": 4.9609375, "step": 3810}, {"loss": 1.3925, "grad_norm": 0.8101866245269775, "learning_rate": 0.0002, "epoch": 4.973958333333333, "step": 3820}, {"loss": 1.3881, "grad_norm": 0.7808696627616882, "learning_rate": 0.0002, "epoch": 4.986979166666667, "step": 3830}, {"loss": 1.4475, "grad_norm": 0.9609483480453491, "learning_rate": 0.0002, "epoch": 5.0, "step": 3840}]} +{"epoch": 6.0, "step": 4608, "epoch_duration": 1443.0496308803558, "total_accumulated_duration": 13566.04918885231, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}, {"eval_loss": 1.8474308252334595, "eval_runtime": 103.7697, "eval_samples_per_second": 4.963, "eval_steps_per_second": 0.626, "epoch": 3.0, "step": 2304}, {"loss": 1.5454, "grad_norm": 0.4801871180534363, "learning_rate": 0.0002, "epoch": 3.0078125, "step": 2310}, {"loss": 1.4019, "grad_norm": 0.5789998173713684, "learning_rate": 0.0002, "epoch": 3.0208333333333335, "step": 2320}, {"loss": 1.4419, "grad_norm": 0.49856704473495483, "learning_rate": 0.0002, "epoch": 3.0338541666666665, "step": 2330}, {"loss": 1.4718, "grad_norm": 0.5625631213188171, "learning_rate": 0.0002, "epoch": 3.046875, "step": 2340}, {"loss": 1.4727, "grad_norm": 0.557637095451355, "learning_rate": 0.0002, "epoch": 3.0598958333333335, "step": 2350}, {"loss": 1.4654, "grad_norm": 0.528889536857605, "learning_rate": 0.0002, "epoch": 3.0729166666666665, "step": 2360}, {"loss": 1.4307, "grad_norm": 0.5952284932136536, "learning_rate": 0.0002, "epoch": 3.0859375, "step": 2370}, {"loss": 1.5304, "grad_norm": 0.5549899339675903, "learning_rate": 0.0002, "epoch": 3.0989583333333335, "step": 2380}, {"loss": 1.5034, "grad_norm": 0.662139892578125, "learning_rate": 0.0002, "epoch": 3.1119791666666665, "step": 2390}, {"loss": 1.4754, "grad_norm": 0.5281530618667603, "learning_rate": 0.0002, "epoch": 3.125, "step": 2400}, {"loss": 1.4047, "grad_norm": 0.6134106516838074, "learning_rate": 0.0002, "epoch": 3.1380208333333335, "step": 2410}, {"loss": 1.5001, "grad_norm": 0.6040887236595154, "learning_rate": 0.0002, "epoch": 3.1510416666666665, "step": 2420}, {"loss": 1.3936, "grad_norm": 0.549672544002533, "learning_rate": 0.0002, "epoch": 3.1640625, "step": 2430}, {"loss": 1.401, "grad_norm": 0.9195653796195984, "learning_rate": 0.0002, "epoch": 3.1770833333333335, "step": 2440}, {"loss": 1.507, "grad_norm": 0.5578703284263611, "learning_rate": 0.0002, "epoch": 3.1901041666666665, "step": 2450}, {"loss": 1.4873, "grad_norm": 0.5982925891876221, "learning_rate": 0.0002, "epoch": 3.203125, "step": 2460}, {"loss": 1.4909, "grad_norm": 0.5544393062591553, "learning_rate": 0.0002, "epoch": 3.2161458333333335, "step": 2470}, {"loss": 1.4705, "grad_norm": 0.6015266180038452, "learning_rate": 0.0002, "epoch": 3.2291666666666665, "step": 2480}, {"loss": 1.4652, "grad_norm": 0.5995243191719055, "learning_rate": 0.0002, "epoch": 3.2421875, "step": 2490}, {"loss": 1.4486, "grad_norm": 0.5846129059791565, "learning_rate": 0.0002, "epoch": 3.2552083333333335, "step": 2500}, {"loss": 1.4529, "grad_norm": 0.5552570223808289, "learning_rate": 0.0002, "epoch": 3.2682291666666665, "step": 2510}, {"loss": 1.3884, "grad_norm": 0.576998233795166, "learning_rate": 0.0002, "epoch": 3.28125, "step": 2520}, {"loss": 1.4463, "grad_norm": 0.6526138186454773, "learning_rate": 0.0002, "epoch": 3.2942708333333335, "step": 2530}, {"loss": 1.474, "grad_norm": 0.6064265966415405, "learning_rate": 0.0002, "epoch": 3.3072916666666665, "step": 2540}, {"loss": 1.5125, "grad_norm": 0.5542362928390503, "learning_rate": 0.0002, "epoch": 3.3203125, "step": 2550}, {"loss": 1.4769, "grad_norm": 0.6048482060432434, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2560}, {"loss": 1.4682, "grad_norm": 0.6328344941139221, "learning_rate": 0.0002, "epoch": 3.3463541666666665, "step": 2570}, {"loss": 1.5647, "grad_norm": 0.6347311735153198, "learning_rate": 0.0002, "epoch": 3.359375, "step": 2580}, {"loss": 1.5752, "grad_norm": 0.537570595741272, "learning_rate": 0.0002, "epoch": 3.3723958333333335, "step": 2590}, {"loss": 1.4086, "grad_norm": 0.5704807639122009, "learning_rate": 0.0002, "epoch": 3.3854166666666665, "step": 2600}, {"loss": 1.5653, "grad_norm": 0.5914373993873596, "learning_rate": 0.0002, "epoch": 3.3984375, "step": 2610}, {"loss": 1.4436, "grad_norm": 0.6724640130996704, "learning_rate": 0.0002, "epoch": 3.4114583333333335, "step": 2620}, {"loss": 1.5731, "grad_norm": 0.6295472383499146, "learning_rate": 0.0002, "epoch": 3.4244791666666665, "step": 2630}, {"loss": 1.4715, "grad_norm": 0.5842770934104919, "learning_rate": 0.0002, "epoch": 3.4375, "step": 2640}, {"loss": 1.451, "grad_norm": 0.6297776699066162, "learning_rate": 0.0002, "epoch": 3.4505208333333335, "step": 2650}, {"loss": 1.5761, "grad_norm": 0.6105847358703613, "learning_rate": 0.0002, "epoch": 3.4635416666666665, "step": 2660}, {"loss": 1.5332, "grad_norm": 0.6294940710067749, "learning_rate": 0.0002, "epoch": 3.4765625, "step": 2670}, {"loss": 1.5451, "grad_norm": 0.6573333740234375, "learning_rate": 0.0002, "epoch": 3.4895833333333335, "step": 2680}, {"loss": 1.4592, "grad_norm": 0.663661539554596, "learning_rate": 0.0002, "epoch": 3.5026041666666665, "step": 2690}, {"loss": 1.5286, "grad_norm": 0.6729148626327515, "learning_rate": 0.0002, "epoch": 3.515625, "step": 2700}, {"loss": 1.534, "grad_norm": 0.6633102893829346, "learning_rate": 0.0002, "epoch": 3.5286458333333335, "step": 2710}, {"loss": 1.4023, "grad_norm": 0.567686915397644, "learning_rate": 0.0002, "epoch": 3.5416666666666665, "step": 2720}, {"loss": 1.4925, "grad_norm": 0.6281962394714355, "learning_rate": 0.0002, "epoch": 3.5546875, "step": 2730}, {"loss": 1.5028, "grad_norm": 0.5710738897323608, "learning_rate": 0.0002, "epoch": 3.5677083333333335, "step": 2740}, {"loss": 1.4393, "grad_norm": 0.648162305355072, "learning_rate": 0.0002, "epoch": 3.5807291666666665, "step": 2750}, {"loss": 1.4294, "grad_norm": 0.5466254949569702, "learning_rate": 0.0002, "epoch": 3.59375, "step": 2760}, {"loss": 1.4993, "grad_norm": 0.6867973208427429, "learning_rate": 0.0002, "epoch": 3.6067708333333335, "step": 2770}, {"loss": 1.4463, "grad_norm": 0.673612117767334, "learning_rate": 0.0002, "epoch": 3.6197916666666665, "step": 2780}, {"loss": 1.5231, "grad_norm": 0.6928417086601257, "learning_rate": 0.0002, "epoch": 3.6328125, "step": 2790}, {"loss": 1.5212, "grad_norm": 0.6603742837905884, "learning_rate": 0.0002, "epoch": 3.6458333333333335, "step": 2800}, {"loss": 1.4889, "grad_norm": 0.5964401960372925, "learning_rate": 0.0002, "epoch": 3.6588541666666665, "step": 2810}, {"loss": 1.4585, "grad_norm": 0.6224474310874939, "learning_rate": 0.0002, "epoch": 3.671875, "step": 2820}, {"loss": 1.5119, "grad_norm": 0.6592439413070679, "learning_rate": 0.0002, "epoch": 3.6848958333333335, "step": 2830}, {"loss": 1.4729, "grad_norm": 0.6255369186401367, "learning_rate": 0.0002, "epoch": 3.6979166666666665, "step": 2840}, {"loss": 1.4598, "grad_norm": 0.7136337757110596, "learning_rate": 0.0002, "epoch": 3.7109375, "step": 2850}, {"loss": 1.4491, "grad_norm": 0.6229757070541382, "learning_rate": 0.0002, "epoch": 3.7239583333333335, "step": 2860}, {"loss": 1.4175, "grad_norm": 0.696080207824707, "learning_rate": 0.0002, "epoch": 3.7369791666666665, "step": 2870}, {"loss": 1.5127, "grad_norm": 0.571873664855957, "learning_rate": 0.0002, "epoch": 3.75, "step": 2880}, {"loss": 1.4093, "grad_norm": 0.5918916463851929, "learning_rate": 0.0002, "epoch": 3.7630208333333335, "step": 2890}, {"loss": 1.399, "grad_norm": 0.616413950920105, "learning_rate": 0.0002, "epoch": 3.7760416666666665, "step": 2900}, {"loss": 1.4215, "grad_norm": 0.6267292499542236, "learning_rate": 0.0002, "epoch": 3.7890625, "step": 2910}, {"loss": 1.5095, "grad_norm": 0.6630783677101135, "learning_rate": 0.0002, "epoch": 3.8020833333333335, "step": 2920}, {"loss": 1.5323, "grad_norm": 0.6004238724708557, "learning_rate": 0.0002, "epoch": 3.8151041666666665, "step": 2930}, {"loss": 1.4953, "grad_norm": 0.6740423440933228, "learning_rate": 0.0002, "epoch": 3.828125, "step": 2940}, {"loss": 1.549, "grad_norm": 0.6397785544395447, "learning_rate": 0.0002, "epoch": 3.8411458333333335, "step": 2950}, {"loss": 1.5309, "grad_norm": 0.6063735485076904, "learning_rate": 0.0002, "epoch": 3.8541666666666665, "step": 2960}, {"loss": 1.5093, "grad_norm": 0.6462053060531616, "learning_rate": 0.0002, "epoch": 3.8671875, "step": 2970}, {"loss": 1.5237, "grad_norm": 0.7143250107765198, "learning_rate": 0.0002, "epoch": 3.8802083333333335, "step": 2980}, {"loss": 1.4419, "grad_norm": 0.6747874617576599, "learning_rate": 0.0002, "epoch": 3.8932291666666665, "step": 2990}, {"loss": 1.5389, "grad_norm": 0.622930109500885, "learning_rate": 0.0002, "epoch": 3.90625, "step": 3000}, {"loss": 1.4279, "grad_norm": 0.620193600654602, "learning_rate": 0.0002, "epoch": 3.9192708333333335, "step": 3010}, {"loss": 1.495, "grad_norm": 0.6321487426757812, "learning_rate": 0.0002, "epoch": 3.9322916666666665, "step": 3020}, {"loss": 1.4657, "grad_norm": 0.5705523490905762, "learning_rate": 0.0002, "epoch": 3.9453125, "step": 3030}, {"loss": 1.4099, "grad_norm": 0.6185072660446167, "learning_rate": 0.0002, "epoch": 3.9583333333333335, "step": 3040}, {"loss": 1.4667, "grad_norm": 0.6005704998970032, "learning_rate": 0.0002, "epoch": 3.9713541666666665, "step": 3050}, {"loss": 1.4896, "grad_norm": 0.5933769941329956, "learning_rate": 0.0002, "epoch": 3.984375, "step": 3060}, {"loss": 1.4973, "grad_norm": 0.695209801197052, "learning_rate": 0.0002, "epoch": 3.9973958333333335, "step": 3070}, {"eval_loss": 1.8955267667770386, "eval_runtime": 103.5061, "eval_samples_per_second": 4.976, "eval_steps_per_second": 0.628, "epoch": 4.0, "step": 3072}, {"loss": 1.3502, "grad_norm": 0.6706188321113586, "learning_rate": 0.0002, "epoch": 4.010416666666667, "step": 3080}, {"loss": 1.2917, "grad_norm": 0.7263980507850647, "learning_rate": 0.0002, "epoch": 4.0234375, "step": 3090}, {"loss": 1.2845, "grad_norm": 0.7767240405082703, "learning_rate": 0.0002, "epoch": 4.036458333333333, "step": 3100}, {"loss": 1.4169, "grad_norm": 0.6888399124145508, "learning_rate": 0.0002, "epoch": 4.049479166666667, "step": 3110}, {"loss": 1.2422, "grad_norm": 0.8860331773757935, "learning_rate": 0.0002, "epoch": 4.0625, "step": 3120}, {"loss": 1.2842, "grad_norm": 0.7572373151779175, "learning_rate": 0.0002, "epoch": 4.075520833333333, "step": 3130}, {"loss": 1.2747, "grad_norm": 0.8321536183357239, "learning_rate": 0.0002, "epoch": 4.088541666666667, "step": 3140}, {"loss": 1.2843, "grad_norm": 0.7042664885520935, "learning_rate": 0.0002, "epoch": 4.1015625, "step": 3150}, {"loss": 1.3326, "grad_norm": 0.8910216689109802, "learning_rate": 0.0002, "epoch": 4.114583333333333, "step": 3160}, {"loss": 1.2742, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 4.127604166666667, "step": 3170}, {"loss": 1.2985, "grad_norm": 0.7120883464813232, "learning_rate": 0.0002, "epoch": 4.140625, "step": 3180}, {"loss": 1.3611, "grad_norm": 0.6904631853103638, "learning_rate": 0.0002, "epoch": 4.153645833333333, "step": 3190}, {"loss": 1.2881, "grad_norm": 0.6398878693580627, "learning_rate": 0.0002, "epoch": 4.166666666666667, "step": 3200}, {"loss": 1.3323, "grad_norm": 0.7573692798614502, "learning_rate": 0.0002, "epoch": 4.1796875, "step": 3210}, {"loss": 1.3509, "grad_norm": 0.7850743532180786, "learning_rate": 0.0002, "epoch": 4.192708333333333, "step": 3220}, {"loss": 1.3176, "grad_norm": 0.7863165736198425, "learning_rate": 0.0002, "epoch": 4.205729166666667, "step": 3230}, {"loss": 1.3739, "grad_norm": 0.7855865359306335, "learning_rate": 0.0002, "epoch": 4.21875, "step": 3240}, {"loss": 1.3251, "grad_norm": 0.6840922832489014, "learning_rate": 0.0002, "epoch": 4.231770833333333, "step": 3250}, {"loss": 1.32, "grad_norm": 0.8499747514724731, "learning_rate": 0.0002, "epoch": 4.244791666666667, "step": 3260}, {"loss": 1.4045, "grad_norm": 0.7982883453369141, "learning_rate": 0.0002, "epoch": 4.2578125, "step": 3270}, {"loss": 1.3922, "grad_norm": 0.7776934504508972, "learning_rate": 0.0002, "epoch": 4.270833333333333, "step": 3280}, {"loss": 1.309, "grad_norm": 0.8887693881988525, "learning_rate": 0.0002, "epoch": 4.283854166666667, "step": 3290}, {"loss": 1.3213, "grad_norm": 1.0184714794158936, "learning_rate": 0.0002, "epoch": 4.296875, "step": 3300}, {"loss": 1.3212, "grad_norm": 0.7539387345314026, "learning_rate": 0.0002, "epoch": 4.309895833333333, "step": 3310}, {"loss": 1.3403, "grad_norm": 0.8137491345405579, "learning_rate": 0.0002, "epoch": 4.322916666666667, "step": 3320}, {"loss": 1.3069, "grad_norm": 0.8136276006698608, "learning_rate": 0.0002, "epoch": 4.3359375, "step": 3330}, {"loss": 1.3512, "grad_norm": 0.7880964279174805, "learning_rate": 0.0002, "epoch": 4.348958333333333, "step": 3340}, {"loss": 1.3468, "grad_norm": 0.8654456734657288, "learning_rate": 0.0002, "epoch": 4.361979166666667, "step": 3350}, {"loss": 1.3036, "grad_norm": 0.8093366622924805, "learning_rate": 0.0002, "epoch": 4.375, "step": 3360}, {"loss": 1.3826, "grad_norm": 0.8738575577735901, "learning_rate": 0.0002, "epoch": 4.388020833333333, "step": 3370}, {"loss": 1.3485, "grad_norm": 0.8923026919364929, "learning_rate": 0.0002, "epoch": 4.401041666666667, "step": 3380}, {"loss": 1.3628, "grad_norm": 0.8508910536766052, "learning_rate": 0.0002, "epoch": 4.4140625, "step": 3390}, {"loss": 1.3048, "grad_norm": 0.8262084722518921, "learning_rate": 0.0002, "epoch": 4.427083333333333, "step": 3400}, {"loss": 1.3145, "grad_norm": 0.7843561768531799, "learning_rate": 0.0002, "epoch": 4.440104166666667, "step": 3410}, {"loss": 1.4526, "grad_norm": 0.9087795615196228, "learning_rate": 0.0002, "epoch": 4.453125, "step": 3420}, {"loss": 1.3492, "grad_norm": 0.8278809189796448, "learning_rate": 0.0002, "epoch": 4.466145833333333, "step": 3430}, {"loss": 1.3797, "grad_norm": 0.8337010741233826, "learning_rate": 0.0002, "epoch": 4.479166666666667, "step": 3440}, {"loss": 1.3199, "grad_norm": 0.7790088057518005, "learning_rate": 0.0002, "epoch": 4.4921875, "step": 3450}, {"loss": 1.3344, "grad_norm": 0.826231837272644, "learning_rate": 0.0002, "epoch": 4.505208333333333, "step": 3460}, {"loss": 1.3915, "grad_norm": 0.761461079120636, "learning_rate": 0.0002, "epoch": 4.518229166666667, "step": 3470}, {"loss": 1.2829, "grad_norm": 0.8892785906791687, "learning_rate": 0.0002, "epoch": 4.53125, "step": 3480}, {"loss": 1.3571, "grad_norm": 0.6087225675582886, "learning_rate": 0.0002, "epoch": 4.544270833333333, "step": 3490}, {"loss": 1.3167, "grad_norm": 0.8259274363517761, "learning_rate": 0.0002, "epoch": 4.557291666666667, "step": 3500}, {"loss": 1.3664, "grad_norm": 0.821164071559906, "learning_rate": 0.0002, "epoch": 4.5703125, "step": 3510}, {"loss": 1.2853, "grad_norm": 0.7262887954711914, "learning_rate": 0.0002, "epoch": 4.583333333333333, "step": 3520}, {"loss": 1.3777, "grad_norm": 0.8564826250076294, "learning_rate": 0.0002, "epoch": 4.596354166666667, "step": 3530}, {"loss": 1.3238, "grad_norm": 0.8072929978370667, "learning_rate": 0.0002, "epoch": 4.609375, "step": 3540}, {"loss": 1.43, "grad_norm": 0.8040832877159119, "learning_rate": 0.0002, "epoch": 4.622395833333333, "step": 3550}, {"loss": 1.2863, "grad_norm": 0.7268754839897156, "learning_rate": 0.0002, "epoch": 4.635416666666667, "step": 3560}, {"loss": 1.3485, "grad_norm": 0.9985134601593018, "learning_rate": 0.0002, "epoch": 4.6484375, "step": 3570}, {"loss": 1.3221, "grad_norm": 0.9826098680496216, "learning_rate": 0.0002, "epoch": 4.661458333333333, "step": 3580}, {"loss": 1.2878, "grad_norm": 0.8794422149658203, "learning_rate": 0.0002, "epoch": 4.674479166666667, "step": 3590}, {"loss": 1.3674, "grad_norm": 0.7207489609718323, "learning_rate": 0.0002, "epoch": 4.6875, "step": 3600}, {"loss": 1.3192, "grad_norm": 0.7546059489250183, "learning_rate": 0.0002, "epoch": 4.700520833333333, "step": 3610}, {"loss": 1.3445, "grad_norm": 0.8318526148796082, "learning_rate": 0.0002, "epoch": 4.713541666666667, "step": 3620}, {"loss": 1.3847, "grad_norm": 0.7529309391975403, "learning_rate": 0.0002, "epoch": 4.7265625, "step": 3630}, {"loss": 1.4208, "grad_norm": 0.7762532234191895, "learning_rate": 0.0002, "epoch": 4.739583333333333, "step": 3640}, {"loss": 1.4162, "grad_norm": 0.9306083917617798, "learning_rate": 0.0002, "epoch": 4.752604166666667, "step": 3650}, {"loss": 1.3828, "grad_norm": 0.8050256967544556, "learning_rate": 0.0002, "epoch": 4.765625, "step": 3660}, {"loss": 1.3671, "grad_norm": 0.8114449381828308, "learning_rate": 0.0002, "epoch": 4.778645833333333, "step": 3670}, {"loss": 1.3296, "grad_norm": 0.8125811815261841, "learning_rate": 0.0002, "epoch": 4.791666666666667, "step": 3680}, {"loss": 1.3222, "grad_norm": 0.7642565369606018, "learning_rate": 0.0002, "epoch": 4.8046875, "step": 3690}, {"loss": 1.2842, "grad_norm": 0.8970131874084473, "learning_rate": 0.0002, "epoch": 4.817708333333333, "step": 3700}, {"loss": 1.3983, "grad_norm": 0.7654327154159546, "learning_rate": 0.0002, "epoch": 4.830729166666667, "step": 3710}, {"loss": 1.3746, "grad_norm": 0.7605378031730652, "learning_rate": 0.0002, "epoch": 4.84375, "step": 3720}, {"loss": 1.3149, "grad_norm": 0.8340551257133484, "learning_rate": 0.0002, "epoch": 4.856770833333333, "step": 3730}, {"loss": 1.4309, "grad_norm": 0.7273691296577454, "learning_rate": 0.0002, "epoch": 4.869791666666667, "step": 3740}, {"loss": 1.3094, "grad_norm": 0.9718272686004639, "learning_rate": 0.0002, "epoch": 4.8828125, "step": 3750}, {"loss": 1.296, "grad_norm": 0.7891847491264343, "learning_rate": 0.0002, "epoch": 4.895833333333333, "step": 3760}, {"loss": 1.4613, "grad_norm": 0.9090818166732788, "learning_rate": 0.0002, "epoch": 4.908854166666667, "step": 3770}, {"loss": 1.3478, "grad_norm": 0.7963318824768066, "learning_rate": 0.0002, "epoch": 4.921875, "step": 3780}, {"loss": 1.3558, "grad_norm": 0.7588343620300293, "learning_rate": 0.0002, "epoch": 4.934895833333333, "step": 3790}, {"loss": 1.3664, "grad_norm": 0.84076327085495, "learning_rate": 0.0002, "epoch": 4.947916666666667, "step": 3800}, {"loss": 1.2836, "grad_norm": 0.7767227292060852, "learning_rate": 0.0002, "epoch": 4.9609375, "step": 3810}, {"loss": 1.3925, "grad_norm": 0.8101866245269775, "learning_rate": 0.0002, "epoch": 4.973958333333333, "step": 3820}, {"loss": 1.3881, "grad_norm": 0.7808696627616882, "learning_rate": 0.0002, "epoch": 4.986979166666667, "step": 3830}, {"loss": 1.4475, "grad_norm": 0.9609483480453491, "learning_rate": 0.0002, "epoch": 5.0, "step": 3840}, {"eval_loss": 1.9610719680786133, "eval_runtime": 87.6572, "eval_samples_per_second": 5.875, "eval_steps_per_second": 0.742, "epoch": 5.0, "step": 3840}, {"loss": 1.1603, "grad_norm": 0.9366803765296936, "learning_rate": 0.0002, "epoch": 5.013020833333333, "step": 3850}, {"loss": 1.1931, "grad_norm": 0.8014302849769592, "learning_rate": 0.0002, "epoch": 5.026041666666667, "step": 3860}, {"loss": 1.1418, "grad_norm": 0.977936863899231, "learning_rate": 0.0002, "epoch": 5.0390625, "step": 3870}, {"loss": 1.1258, "grad_norm": 1.045047640800476, "learning_rate": 0.0002, "epoch": 5.052083333333333, "step": 3880}, {"loss": 1.1709, "grad_norm": 1.125620722770691, "learning_rate": 0.0002, "epoch": 5.065104166666667, "step": 3890}, {"loss": 1.1954, "grad_norm": 1.1565124988555908, "learning_rate": 0.0002, "epoch": 5.078125, "step": 3900}, {"loss": 1.1753, "grad_norm": 1.102354884147644, "learning_rate": 0.0002, "epoch": 5.091145833333333, "step": 3910}, {"loss": 1.1632, "grad_norm": 0.9567629098892212, "learning_rate": 0.0002, "epoch": 5.104166666666667, "step": 3920}, {"loss": 1.1875, "grad_norm": 0.9760252833366394, "learning_rate": 0.0002, "epoch": 5.1171875, "step": 3930}, {"loss": 1.2289, "grad_norm": 1.026168704032898, "learning_rate": 0.0002, "epoch": 5.130208333333333, "step": 3940}, {"loss": 1.1598, "grad_norm": 1.1490436792373657, "learning_rate": 0.0002, "epoch": 5.143229166666667, "step": 3950}, {"loss": 1.0823, "grad_norm": 0.9712087512016296, "learning_rate": 0.0002, "epoch": 5.15625, "step": 3960}, {"loss": 1.1948, "grad_norm": 1.0095003843307495, "learning_rate": 0.0002, "epoch": 5.169270833333333, "step": 3970}, {"loss": 1.1617, "grad_norm": 0.9171855449676514, "learning_rate": 0.0002, "epoch": 5.182291666666667, "step": 3980}, {"loss": 1.161, "grad_norm": 1.0105657577514648, "learning_rate": 0.0002, "epoch": 5.1953125, "step": 3990}, {"loss": 1.2098, "grad_norm": 1.0330145359039307, "learning_rate": 0.0002, "epoch": 5.208333333333333, "step": 4000}, {"loss": 1.1965, "grad_norm": 1.0676906108856201, "learning_rate": 0.0002, "epoch": 5.221354166666667, "step": 4010}, {"loss": 1.1392, "grad_norm": 1.055088758468628, "learning_rate": 0.0002, "epoch": 5.234375, "step": 4020}, {"loss": 1.2173, "grad_norm": 0.9523683786392212, "learning_rate": 0.0002, "epoch": 5.247395833333333, "step": 4030}, {"loss": 1.1167, "grad_norm": 0.9013799428939819, "learning_rate": 0.0002, "epoch": 5.260416666666667, "step": 4040}, {"loss": 1.2274, "grad_norm": 0.9379037618637085, "learning_rate": 0.0002, "epoch": 5.2734375, "step": 4050}, {"loss": 1.1246, "grad_norm": 0.9565327763557434, "learning_rate": 0.0002, "epoch": 5.286458333333333, "step": 4060}, {"loss": 1.2103, "grad_norm": 1.1994404792785645, "learning_rate": 0.0002, "epoch": 5.299479166666667, "step": 4070}, {"loss": 1.2016, "grad_norm": 1.0563262701034546, "learning_rate": 0.0002, "epoch": 5.3125, "step": 4080}, {"loss": 1.2478, "grad_norm": 1.024290680885315, "learning_rate": 0.0002, "epoch": 5.325520833333333, "step": 4090}, {"loss": 1.2388, "grad_norm": 1.0022907257080078, "learning_rate": 0.0002, "epoch": 5.338541666666667, "step": 4100}, {"loss": 1.1948, "grad_norm": 0.9642180800437927, "learning_rate": 0.0002, "epoch": 5.3515625, "step": 4110}, {"loss": 1.231, "grad_norm": 1.0228009223937988, "learning_rate": 0.0002, "epoch": 5.364583333333333, "step": 4120}, {"loss": 1.2341, "grad_norm": 1.0379719734191895, "learning_rate": 0.0002, "epoch": 5.377604166666667, "step": 4130}, {"loss": 1.24, "grad_norm": 1.147053599357605, "learning_rate": 0.0002, "epoch": 5.390625, "step": 4140}, {"loss": 1.2026, "grad_norm": 1.2097876071929932, "learning_rate": 0.0002, "epoch": 5.403645833333333, "step": 4150}, {"loss": 1.1978, "grad_norm": 1.0852497816085815, "learning_rate": 0.0002, "epoch": 5.416666666666667, "step": 4160}, {"loss": 1.2182, "grad_norm": 0.9765135645866394, "learning_rate": 0.0002, "epoch": 5.4296875, "step": 4170}, {"loss": 1.3117, "grad_norm": 1.0180606842041016, "learning_rate": 0.0002, "epoch": 5.442708333333333, "step": 4180}, {"loss": 1.2355, "grad_norm": 1.185409665107727, "learning_rate": 0.0002, "epoch": 5.455729166666667, "step": 4190}, {"loss": 1.1531, "grad_norm": 0.9363358020782471, "learning_rate": 0.0002, "epoch": 5.46875, "step": 4200}, {"loss": 1.1645, "grad_norm": 1.0761215686798096, "learning_rate": 0.0002, "epoch": 5.481770833333333, "step": 4210}, {"loss": 1.1465, "grad_norm": 1.057626724243164, "learning_rate": 0.0002, "epoch": 5.494791666666667, "step": 4220}, {"loss": 1.2051, "grad_norm": 1.0103157758712769, "learning_rate": 0.0002, "epoch": 5.5078125, "step": 4230}, {"loss": 1.2193, "grad_norm": 1.1056627035140991, "learning_rate": 0.0002, "epoch": 5.520833333333333, "step": 4240}, {"loss": 1.1941, "grad_norm": 1.0256257057189941, "learning_rate": 0.0002, "epoch": 5.533854166666667, "step": 4250}, {"loss": 1.1724, "grad_norm": 1.2814106941223145, "learning_rate": 0.0002, "epoch": 5.546875, "step": 4260}, {"loss": 1.1676, "grad_norm": 0.9044927954673767, "learning_rate": 0.0002, "epoch": 5.559895833333333, "step": 4270}, {"loss": 1.2448, "grad_norm": 0.9870165586471558, "learning_rate": 0.0002, "epoch": 5.572916666666667, "step": 4280}, {"loss": 1.2414, "grad_norm": 0.9867369532585144, "learning_rate": 0.0002, "epoch": 5.5859375, "step": 4290}, {"loss": 1.2115, "grad_norm": 1.045625925064087, "learning_rate": 0.0002, "epoch": 5.598958333333333, "step": 4300}, {"loss": 1.2786, "grad_norm": 0.979853630065918, "learning_rate": 0.0002, "epoch": 5.611979166666667, "step": 4310}, {"loss": 1.1629, "grad_norm": 1.029212236404419, "learning_rate": 0.0002, "epoch": 5.625, "step": 4320}, {"loss": 1.1985, "grad_norm": 1.0348633527755737, "learning_rate": 0.0002, "epoch": 5.638020833333333, "step": 4330}, {"loss": 1.1914, "grad_norm": 1.0055185556411743, "learning_rate": 0.0002, "epoch": 5.651041666666667, "step": 4340}, {"loss": 1.2658, "grad_norm": 0.9312447309494019, "learning_rate": 0.0002, "epoch": 5.6640625, "step": 4350}, {"loss": 1.1901, "grad_norm": 1.1411694288253784, "learning_rate": 0.0002, "epoch": 5.677083333333333, "step": 4360}, {"loss": 1.2679, "grad_norm": 0.9764434695243835, "learning_rate": 0.0002, "epoch": 5.690104166666667, "step": 4370}, {"loss": 1.2215, "grad_norm": 1.079154133796692, "learning_rate": 0.0002, "epoch": 5.703125, "step": 4380}, {"loss": 1.1659, "grad_norm": 0.999526858329773, "learning_rate": 0.0002, "epoch": 5.716145833333333, "step": 4390}, {"loss": 1.1685, "grad_norm": 1.1239734888076782, "learning_rate": 0.0002, "epoch": 5.729166666666667, "step": 4400}, {"loss": 1.1126, "grad_norm": 1.0539512634277344, "learning_rate": 0.0002, "epoch": 5.7421875, "step": 4410}, {"loss": 1.1413, "grad_norm": 0.9884052872657776, "learning_rate": 0.0002, "epoch": 5.755208333333333, "step": 4420}, {"loss": 1.1781, "grad_norm": 0.9821958541870117, "learning_rate": 0.0002, "epoch": 5.768229166666667, "step": 4430}, {"loss": 1.2319, "grad_norm": 0.9340839982032776, "learning_rate": 0.0002, "epoch": 5.78125, "step": 4440}, {"loss": 1.3085, "grad_norm": 0.9935781955718994, "learning_rate": 0.0002, "epoch": 5.794270833333333, "step": 4450}, {"loss": 1.1726, "grad_norm": 1.1027121543884277, "learning_rate": 0.0002, "epoch": 5.807291666666667, "step": 4460}, {"loss": 1.2385, "grad_norm": 0.9388337135314941, "learning_rate": 0.0002, "epoch": 5.8203125, "step": 4470}, {"loss": 1.259, "grad_norm": 1.0957310199737549, "learning_rate": 0.0002, "epoch": 5.833333333333333, "step": 4480}, {"loss": 1.3017, "grad_norm": 1.0832754373550415, "learning_rate": 0.0002, "epoch": 5.846354166666667, "step": 4490}, {"loss": 1.1724, "grad_norm": 0.9498379826545715, "learning_rate": 0.0002, "epoch": 5.859375, "step": 4500}, {"loss": 1.2312, "grad_norm": 0.9104725122451782, "learning_rate": 0.0002, "epoch": 5.872395833333333, "step": 4510}, {"loss": 1.204, "grad_norm": 1.2238177061080933, "learning_rate": 0.0002, "epoch": 5.885416666666667, "step": 4520}, {"loss": 1.2163, "grad_norm": 1.0549527406692505, "learning_rate": 0.0002, "epoch": 5.8984375, "step": 4530}, {"loss": 1.3086, "grad_norm": 1.0415066480636597, "learning_rate": 0.0002, "epoch": 5.911458333333333, "step": 4540}, {"loss": 1.1744, "grad_norm": 0.9098646640777588, "learning_rate": 0.0002, "epoch": 5.924479166666667, "step": 4550}, {"loss": 1.2126, "grad_norm": 0.9182857275009155, "learning_rate": 0.0002, "epoch": 5.9375, "step": 4560}, {"loss": 1.2341, "grad_norm": 1.088038444519043, "learning_rate": 0.0002, "epoch": 5.950520833333333, "step": 4570}, {"loss": 1.2317, "grad_norm": 1.1331020593643188, "learning_rate": 0.0002, "epoch": 5.963541666666667, "step": 4580}, {"loss": 1.2318, "grad_norm": 0.9592235088348389, "learning_rate": 0.0002, "epoch": 5.9765625, "step": 4590}, {"loss": 1.1995, "grad_norm": 1.0126368999481201, "learning_rate": 0.0002, "epoch": 5.989583333333333, "step": 4600}]} +{"epoch": 7.0, "step": 5376, "epoch_duration": 923.8996188640594, "total_accumulated_duration": 14489.94880771637, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}, {"eval_loss": 1.8474308252334595, "eval_runtime": 103.7697, "eval_samples_per_second": 4.963, "eval_steps_per_second": 0.626, "epoch": 3.0, "step": 2304}, {"loss": 1.5454, "grad_norm": 0.4801871180534363, "learning_rate": 0.0002, "epoch": 3.0078125, "step": 2310}, {"loss": 1.4019, "grad_norm": 0.5789998173713684, "learning_rate": 0.0002, "epoch": 3.0208333333333335, "step": 2320}, {"loss": 1.4419, "grad_norm": 0.49856704473495483, "learning_rate": 0.0002, "epoch": 3.0338541666666665, "step": 2330}, {"loss": 1.4718, "grad_norm": 0.5625631213188171, "learning_rate": 0.0002, "epoch": 3.046875, "step": 2340}, {"loss": 1.4727, "grad_norm": 0.557637095451355, "learning_rate": 0.0002, "epoch": 3.0598958333333335, "step": 2350}, {"loss": 1.4654, "grad_norm": 0.528889536857605, "learning_rate": 0.0002, "epoch": 3.0729166666666665, "step": 2360}, {"loss": 1.4307, "grad_norm": 0.5952284932136536, "learning_rate": 0.0002, "epoch": 3.0859375, "step": 2370}, {"loss": 1.5304, "grad_norm": 0.5549899339675903, "learning_rate": 0.0002, "epoch": 3.0989583333333335, "step": 2380}, {"loss": 1.5034, "grad_norm": 0.662139892578125, "learning_rate": 0.0002, "epoch": 3.1119791666666665, "step": 2390}, {"loss": 1.4754, "grad_norm": 0.5281530618667603, "learning_rate": 0.0002, "epoch": 3.125, "step": 2400}, {"loss": 1.4047, "grad_norm": 0.6134106516838074, "learning_rate": 0.0002, "epoch": 3.1380208333333335, "step": 2410}, {"loss": 1.5001, "grad_norm": 0.6040887236595154, "learning_rate": 0.0002, "epoch": 3.1510416666666665, "step": 2420}, {"loss": 1.3936, "grad_norm": 0.549672544002533, "learning_rate": 0.0002, "epoch": 3.1640625, "step": 2430}, {"loss": 1.401, "grad_norm": 0.9195653796195984, "learning_rate": 0.0002, "epoch": 3.1770833333333335, "step": 2440}, {"loss": 1.507, "grad_norm": 0.5578703284263611, "learning_rate": 0.0002, "epoch": 3.1901041666666665, "step": 2450}, {"loss": 1.4873, "grad_norm": 0.5982925891876221, "learning_rate": 0.0002, "epoch": 3.203125, "step": 2460}, {"loss": 1.4909, "grad_norm": 0.5544393062591553, "learning_rate": 0.0002, "epoch": 3.2161458333333335, "step": 2470}, {"loss": 1.4705, "grad_norm": 0.6015266180038452, "learning_rate": 0.0002, "epoch": 3.2291666666666665, "step": 2480}, {"loss": 1.4652, "grad_norm": 0.5995243191719055, "learning_rate": 0.0002, "epoch": 3.2421875, "step": 2490}, {"loss": 1.4486, "grad_norm": 0.5846129059791565, "learning_rate": 0.0002, "epoch": 3.2552083333333335, "step": 2500}, {"loss": 1.4529, "grad_norm": 0.5552570223808289, "learning_rate": 0.0002, "epoch": 3.2682291666666665, "step": 2510}, {"loss": 1.3884, "grad_norm": 0.576998233795166, "learning_rate": 0.0002, "epoch": 3.28125, "step": 2520}, {"loss": 1.4463, "grad_norm": 0.6526138186454773, "learning_rate": 0.0002, "epoch": 3.2942708333333335, "step": 2530}, {"loss": 1.474, "grad_norm": 0.6064265966415405, "learning_rate": 0.0002, "epoch": 3.3072916666666665, "step": 2540}, {"loss": 1.5125, "grad_norm": 0.5542362928390503, "learning_rate": 0.0002, "epoch": 3.3203125, "step": 2550}, {"loss": 1.4769, "grad_norm": 0.6048482060432434, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2560}, {"loss": 1.4682, "grad_norm": 0.6328344941139221, "learning_rate": 0.0002, "epoch": 3.3463541666666665, "step": 2570}, {"loss": 1.5647, "grad_norm": 0.6347311735153198, "learning_rate": 0.0002, "epoch": 3.359375, "step": 2580}, {"loss": 1.5752, "grad_norm": 0.537570595741272, "learning_rate": 0.0002, "epoch": 3.3723958333333335, "step": 2590}, {"loss": 1.4086, "grad_norm": 0.5704807639122009, "learning_rate": 0.0002, "epoch": 3.3854166666666665, "step": 2600}, {"loss": 1.5653, "grad_norm": 0.5914373993873596, "learning_rate": 0.0002, "epoch": 3.3984375, "step": 2610}, {"loss": 1.4436, "grad_norm": 0.6724640130996704, "learning_rate": 0.0002, "epoch": 3.4114583333333335, "step": 2620}, {"loss": 1.5731, "grad_norm": 0.6295472383499146, "learning_rate": 0.0002, "epoch": 3.4244791666666665, "step": 2630}, {"loss": 1.4715, "grad_norm": 0.5842770934104919, "learning_rate": 0.0002, "epoch": 3.4375, "step": 2640}, {"loss": 1.451, "grad_norm": 0.6297776699066162, "learning_rate": 0.0002, "epoch": 3.4505208333333335, "step": 2650}, {"loss": 1.5761, "grad_norm": 0.6105847358703613, "learning_rate": 0.0002, "epoch": 3.4635416666666665, "step": 2660}, {"loss": 1.5332, "grad_norm": 0.6294940710067749, "learning_rate": 0.0002, "epoch": 3.4765625, "step": 2670}, {"loss": 1.5451, "grad_norm": 0.6573333740234375, "learning_rate": 0.0002, "epoch": 3.4895833333333335, "step": 2680}, {"loss": 1.4592, "grad_norm": 0.663661539554596, "learning_rate": 0.0002, "epoch": 3.5026041666666665, "step": 2690}, {"loss": 1.5286, "grad_norm": 0.6729148626327515, "learning_rate": 0.0002, "epoch": 3.515625, "step": 2700}, {"loss": 1.534, "grad_norm": 0.6633102893829346, "learning_rate": 0.0002, "epoch": 3.5286458333333335, "step": 2710}, {"loss": 1.4023, "grad_norm": 0.567686915397644, "learning_rate": 0.0002, "epoch": 3.5416666666666665, "step": 2720}, {"loss": 1.4925, "grad_norm": 0.6281962394714355, "learning_rate": 0.0002, "epoch": 3.5546875, "step": 2730}, {"loss": 1.5028, "grad_norm": 0.5710738897323608, "learning_rate": 0.0002, "epoch": 3.5677083333333335, "step": 2740}, {"loss": 1.4393, "grad_norm": 0.648162305355072, "learning_rate": 0.0002, "epoch": 3.5807291666666665, "step": 2750}, {"loss": 1.4294, "grad_norm": 0.5466254949569702, "learning_rate": 0.0002, "epoch": 3.59375, "step": 2760}, {"loss": 1.4993, "grad_norm": 0.6867973208427429, "learning_rate": 0.0002, "epoch": 3.6067708333333335, "step": 2770}, {"loss": 1.4463, "grad_norm": 0.673612117767334, "learning_rate": 0.0002, "epoch": 3.6197916666666665, "step": 2780}, {"loss": 1.5231, "grad_norm": 0.6928417086601257, "learning_rate": 0.0002, "epoch": 3.6328125, "step": 2790}, {"loss": 1.5212, "grad_norm": 0.6603742837905884, "learning_rate": 0.0002, "epoch": 3.6458333333333335, "step": 2800}, {"loss": 1.4889, "grad_norm": 0.5964401960372925, "learning_rate": 0.0002, "epoch": 3.6588541666666665, "step": 2810}, {"loss": 1.4585, "grad_norm": 0.6224474310874939, "learning_rate": 0.0002, "epoch": 3.671875, "step": 2820}, {"loss": 1.5119, "grad_norm": 0.6592439413070679, "learning_rate": 0.0002, "epoch": 3.6848958333333335, "step": 2830}, {"loss": 1.4729, "grad_norm": 0.6255369186401367, "learning_rate": 0.0002, "epoch": 3.6979166666666665, "step": 2840}, {"loss": 1.4598, "grad_norm": 0.7136337757110596, "learning_rate": 0.0002, "epoch": 3.7109375, "step": 2850}, {"loss": 1.4491, "grad_norm": 0.6229757070541382, "learning_rate": 0.0002, "epoch": 3.7239583333333335, "step": 2860}, {"loss": 1.4175, "grad_norm": 0.696080207824707, "learning_rate": 0.0002, "epoch": 3.7369791666666665, "step": 2870}, {"loss": 1.5127, "grad_norm": 0.571873664855957, "learning_rate": 0.0002, "epoch": 3.75, "step": 2880}, {"loss": 1.4093, "grad_norm": 0.5918916463851929, "learning_rate": 0.0002, "epoch": 3.7630208333333335, "step": 2890}, {"loss": 1.399, "grad_norm": 0.616413950920105, "learning_rate": 0.0002, "epoch": 3.7760416666666665, "step": 2900}, {"loss": 1.4215, "grad_norm": 0.6267292499542236, "learning_rate": 0.0002, "epoch": 3.7890625, "step": 2910}, {"loss": 1.5095, "grad_norm": 0.6630783677101135, "learning_rate": 0.0002, "epoch": 3.8020833333333335, "step": 2920}, {"loss": 1.5323, "grad_norm": 0.6004238724708557, "learning_rate": 0.0002, "epoch": 3.8151041666666665, "step": 2930}, {"loss": 1.4953, "grad_norm": 0.6740423440933228, "learning_rate": 0.0002, "epoch": 3.828125, "step": 2940}, {"loss": 1.549, "grad_norm": 0.6397785544395447, "learning_rate": 0.0002, "epoch": 3.8411458333333335, "step": 2950}, {"loss": 1.5309, "grad_norm": 0.6063735485076904, "learning_rate": 0.0002, "epoch": 3.8541666666666665, "step": 2960}, {"loss": 1.5093, "grad_norm": 0.6462053060531616, "learning_rate": 0.0002, "epoch": 3.8671875, "step": 2970}, {"loss": 1.5237, "grad_norm": 0.7143250107765198, "learning_rate": 0.0002, "epoch": 3.8802083333333335, "step": 2980}, {"loss": 1.4419, "grad_norm": 0.6747874617576599, "learning_rate": 0.0002, "epoch": 3.8932291666666665, "step": 2990}, {"loss": 1.5389, "grad_norm": 0.622930109500885, "learning_rate": 0.0002, "epoch": 3.90625, "step": 3000}, {"loss": 1.4279, "grad_norm": 0.620193600654602, "learning_rate": 0.0002, "epoch": 3.9192708333333335, "step": 3010}, {"loss": 1.495, "grad_norm": 0.6321487426757812, "learning_rate": 0.0002, "epoch": 3.9322916666666665, "step": 3020}, {"loss": 1.4657, "grad_norm": 0.5705523490905762, "learning_rate": 0.0002, "epoch": 3.9453125, "step": 3030}, {"loss": 1.4099, "grad_norm": 0.6185072660446167, "learning_rate": 0.0002, "epoch": 3.9583333333333335, "step": 3040}, {"loss": 1.4667, "grad_norm": 0.6005704998970032, "learning_rate": 0.0002, "epoch": 3.9713541666666665, "step": 3050}, {"loss": 1.4896, "grad_norm": 0.5933769941329956, "learning_rate": 0.0002, "epoch": 3.984375, "step": 3060}, {"loss": 1.4973, "grad_norm": 0.695209801197052, "learning_rate": 0.0002, "epoch": 3.9973958333333335, "step": 3070}, {"eval_loss": 1.8955267667770386, "eval_runtime": 103.5061, "eval_samples_per_second": 4.976, "eval_steps_per_second": 0.628, "epoch": 4.0, "step": 3072}, {"loss": 1.3502, "grad_norm": 0.6706188321113586, "learning_rate": 0.0002, "epoch": 4.010416666666667, "step": 3080}, {"loss": 1.2917, "grad_norm": 0.7263980507850647, "learning_rate": 0.0002, "epoch": 4.0234375, "step": 3090}, {"loss": 1.2845, "grad_norm": 0.7767240405082703, "learning_rate": 0.0002, "epoch": 4.036458333333333, "step": 3100}, {"loss": 1.4169, "grad_norm": 0.6888399124145508, "learning_rate": 0.0002, "epoch": 4.049479166666667, "step": 3110}, {"loss": 1.2422, "grad_norm": 0.8860331773757935, "learning_rate": 0.0002, "epoch": 4.0625, "step": 3120}, {"loss": 1.2842, "grad_norm": 0.7572373151779175, "learning_rate": 0.0002, "epoch": 4.075520833333333, "step": 3130}, {"loss": 1.2747, "grad_norm": 0.8321536183357239, "learning_rate": 0.0002, "epoch": 4.088541666666667, "step": 3140}, {"loss": 1.2843, "grad_norm": 0.7042664885520935, "learning_rate": 0.0002, "epoch": 4.1015625, "step": 3150}, {"loss": 1.3326, "grad_norm": 0.8910216689109802, "learning_rate": 0.0002, "epoch": 4.114583333333333, "step": 3160}, {"loss": 1.2742, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 4.127604166666667, "step": 3170}, {"loss": 1.2985, "grad_norm": 0.7120883464813232, "learning_rate": 0.0002, "epoch": 4.140625, "step": 3180}, {"loss": 1.3611, "grad_norm": 0.6904631853103638, "learning_rate": 0.0002, "epoch": 4.153645833333333, "step": 3190}, {"loss": 1.2881, "grad_norm": 0.6398878693580627, "learning_rate": 0.0002, "epoch": 4.166666666666667, "step": 3200}, {"loss": 1.3323, "grad_norm": 0.7573692798614502, "learning_rate": 0.0002, "epoch": 4.1796875, "step": 3210}, {"loss": 1.3509, "grad_norm": 0.7850743532180786, "learning_rate": 0.0002, "epoch": 4.192708333333333, "step": 3220}, {"loss": 1.3176, "grad_norm": 0.7863165736198425, "learning_rate": 0.0002, "epoch": 4.205729166666667, "step": 3230}, {"loss": 1.3739, "grad_norm": 0.7855865359306335, "learning_rate": 0.0002, "epoch": 4.21875, "step": 3240}, {"loss": 1.3251, "grad_norm": 0.6840922832489014, "learning_rate": 0.0002, "epoch": 4.231770833333333, "step": 3250}, {"loss": 1.32, "grad_norm": 0.8499747514724731, "learning_rate": 0.0002, "epoch": 4.244791666666667, "step": 3260}, {"loss": 1.4045, "grad_norm": 0.7982883453369141, "learning_rate": 0.0002, "epoch": 4.2578125, "step": 3270}, {"loss": 1.3922, "grad_norm": 0.7776934504508972, "learning_rate": 0.0002, "epoch": 4.270833333333333, "step": 3280}, {"loss": 1.309, "grad_norm": 0.8887693881988525, "learning_rate": 0.0002, "epoch": 4.283854166666667, "step": 3290}, {"loss": 1.3213, "grad_norm": 1.0184714794158936, "learning_rate": 0.0002, "epoch": 4.296875, "step": 3300}, {"loss": 1.3212, "grad_norm": 0.7539387345314026, "learning_rate": 0.0002, "epoch": 4.309895833333333, "step": 3310}, {"loss": 1.3403, "grad_norm": 0.8137491345405579, "learning_rate": 0.0002, "epoch": 4.322916666666667, "step": 3320}, {"loss": 1.3069, "grad_norm": 0.8136276006698608, "learning_rate": 0.0002, "epoch": 4.3359375, "step": 3330}, {"loss": 1.3512, "grad_norm": 0.7880964279174805, "learning_rate": 0.0002, "epoch": 4.348958333333333, "step": 3340}, {"loss": 1.3468, "grad_norm": 0.8654456734657288, "learning_rate": 0.0002, "epoch": 4.361979166666667, "step": 3350}, {"loss": 1.3036, "grad_norm": 0.8093366622924805, "learning_rate": 0.0002, "epoch": 4.375, "step": 3360}, {"loss": 1.3826, "grad_norm": 0.8738575577735901, "learning_rate": 0.0002, "epoch": 4.388020833333333, "step": 3370}, {"loss": 1.3485, "grad_norm": 0.8923026919364929, "learning_rate": 0.0002, "epoch": 4.401041666666667, "step": 3380}, {"loss": 1.3628, "grad_norm": 0.8508910536766052, "learning_rate": 0.0002, "epoch": 4.4140625, "step": 3390}, {"loss": 1.3048, "grad_norm": 0.8262084722518921, "learning_rate": 0.0002, "epoch": 4.427083333333333, "step": 3400}, {"loss": 1.3145, "grad_norm": 0.7843561768531799, "learning_rate": 0.0002, "epoch": 4.440104166666667, "step": 3410}, {"loss": 1.4526, "grad_norm": 0.9087795615196228, "learning_rate": 0.0002, "epoch": 4.453125, "step": 3420}, {"loss": 1.3492, "grad_norm": 0.8278809189796448, "learning_rate": 0.0002, "epoch": 4.466145833333333, "step": 3430}, {"loss": 1.3797, "grad_norm": 0.8337010741233826, "learning_rate": 0.0002, "epoch": 4.479166666666667, "step": 3440}, {"loss": 1.3199, "grad_norm": 0.7790088057518005, "learning_rate": 0.0002, "epoch": 4.4921875, "step": 3450}, {"loss": 1.3344, "grad_norm": 0.826231837272644, "learning_rate": 0.0002, "epoch": 4.505208333333333, "step": 3460}, {"loss": 1.3915, "grad_norm": 0.761461079120636, "learning_rate": 0.0002, "epoch": 4.518229166666667, "step": 3470}, {"loss": 1.2829, "grad_norm": 0.8892785906791687, "learning_rate": 0.0002, "epoch": 4.53125, "step": 3480}, {"loss": 1.3571, "grad_norm": 0.6087225675582886, "learning_rate": 0.0002, "epoch": 4.544270833333333, "step": 3490}, {"loss": 1.3167, "grad_norm": 0.8259274363517761, "learning_rate": 0.0002, "epoch": 4.557291666666667, "step": 3500}, {"loss": 1.3664, "grad_norm": 0.821164071559906, "learning_rate": 0.0002, "epoch": 4.5703125, "step": 3510}, {"loss": 1.2853, "grad_norm": 0.7262887954711914, "learning_rate": 0.0002, "epoch": 4.583333333333333, "step": 3520}, {"loss": 1.3777, "grad_norm": 0.8564826250076294, "learning_rate": 0.0002, "epoch": 4.596354166666667, "step": 3530}, {"loss": 1.3238, "grad_norm": 0.8072929978370667, "learning_rate": 0.0002, "epoch": 4.609375, "step": 3540}, {"loss": 1.43, "grad_norm": 0.8040832877159119, "learning_rate": 0.0002, "epoch": 4.622395833333333, "step": 3550}, {"loss": 1.2863, "grad_norm": 0.7268754839897156, "learning_rate": 0.0002, "epoch": 4.635416666666667, "step": 3560}, {"loss": 1.3485, "grad_norm": 0.9985134601593018, "learning_rate": 0.0002, "epoch": 4.6484375, "step": 3570}, {"loss": 1.3221, "grad_norm": 0.9826098680496216, "learning_rate": 0.0002, "epoch": 4.661458333333333, "step": 3580}, {"loss": 1.2878, "grad_norm": 0.8794422149658203, "learning_rate": 0.0002, "epoch": 4.674479166666667, "step": 3590}, {"loss": 1.3674, "grad_norm": 0.7207489609718323, "learning_rate": 0.0002, "epoch": 4.6875, "step": 3600}, {"loss": 1.3192, "grad_norm": 0.7546059489250183, "learning_rate": 0.0002, "epoch": 4.700520833333333, "step": 3610}, {"loss": 1.3445, "grad_norm": 0.8318526148796082, "learning_rate": 0.0002, "epoch": 4.713541666666667, "step": 3620}, {"loss": 1.3847, "grad_norm": 0.7529309391975403, "learning_rate": 0.0002, "epoch": 4.7265625, "step": 3630}, {"loss": 1.4208, "grad_norm": 0.7762532234191895, "learning_rate": 0.0002, "epoch": 4.739583333333333, "step": 3640}, {"loss": 1.4162, "grad_norm": 0.9306083917617798, "learning_rate": 0.0002, "epoch": 4.752604166666667, "step": 3650}, {"loss": 1.3828, "grad_norm": 0.8050256967544556, "learning_rate": 0.0002, "epoch": 4.765625, "step": 3660}, {"loss": 1.3671, "grad_norm": 0.8114449381828308, "learning_rate": 0.0002, "epoch": 4.778645833333333, "step": 3670}, {"loss": 1.3296, "grad_norm": 0.8125811815261841, "learning_rate": 0.0002, "epoch": 4.791666666666667, "step": 3680}, {"loss": 1.3222, "grad_norm": 0.7642565369606018, "learning_rate": 0.0002, "epoch": 4.8046875, "step": 3690}, {"loss": 1.2842, "grad_norm": 0.8970131874084473, "learning_rate": 0.0002, "epoch": 4.817708333333333, "step": 3700}, {"loss": 1.3983, "grad_norm": 0.7654327154159546, "learning_rate": 0.0002, "epoch": 4.830729166666667, "step": 3710}, {"loss": 1.3746, "grad_norm": 0.7605378031730652, "learning_rate": 0.0002, "epoch": 4.84375, "step": 3720}, {"loss": 1.3149, "grad_norm": 0.8340551257133484, "learning_rate": 0.0002, "epoch": 4.856770833333333, "step": 3730}, {"loss": 1.4309, "grad_norm": 0.7273691296577454, "learning_rate": 0.0002, "epoch": 4.869791666666667, "step": 3740}, {"loss": 1.3094, "grad_norm": 0.9718272686004639, "learning_rate": 0.0002, "epoch": 4.8828125, "step": 3750}, {"loss": 1.296, "grad_norm": 0.7891847491264343, "learning_rate": 0.0002, "epoch": 4.895833333333333, "step": 3760}, {"loss": 1.4613, "grad_norm": 0.9090818166732788, "learning_rate": 0.0002, "epoch": 4.908854166666667, "step": 3770}, {"loss": 1.3478, "grad_norm": 0.7963318824768066, "learning_rate": 0.0002, "epoch": 4.921875, "step": 3780}, {"loss": 1.3558, "grad_norm": 0.7588343620300293, "learning_rate": 0.0002, "epoch": 4.934895833333333, "step": 3790}, {"loss": 1.3664, "grad_norm": 0.84076327085495, "learning_rate": 0.0002, "epoch": 4.947916666666667, "step": 3800}, {"loss": 1.2836, "grad_norm": 0.7767227292060852, "learning_rate": 0.0002, "epoch": 4.9609375, "step": 3810}, {"loss": 1.3925, "grad_norm": 0.8101866245269775, "learning_rate": 0.0002, "epoch": 4.973958333333333, "step": 3820}, {"loss": 1.3881, "grad_norm": 0.7808696627616882, "learning_rate": 0.0002, "epoch": 4.986979166666667, "step": 3830}, {"loss": 1.4475, "grad_norm": 0.9609483480453491, "learning_rate": 0.0002, "epoch": 5.0, "step": 3840}, {"eval_loss": 1.9610719680786133, "eval_runtime": 87.6572, "eval_samples_per_second": 5.875, "eval_steps_per_second": 0.742, "epoch": 5.0, "step": 3840}, {"loss": 1.1603, "grad_norm": 0.9366803765296936, "learning_rate": 0.0002, "epoch": 5.013020833333333, "step": 3850}, {"loss": 1.1931, "grad_norm": 0.8014302849769592, "learning_rate": 0.0002, "epoch": 5.026041666666667, "step": 3860}, {"loss": 1.1418, "grad_norm": 0.977936863899231, "learning_rate": 0.0002, "epoch": 5.0390625, "step": 3870}, {"loss": 1.1258, "grad_norm": 1.045047640800476, "learning_rate": 0.0002, "epoch": 5.052083333333333, "step": 3880}, {"loss": 1.1709, "grad_norm": 1.125620722770691, "learning_rate": 0.0002, "epoch": 5.065104166666667, "step": 3890}, {"loss": 1.1954, "grad_norm": 1.1565124988555908, "learning_rate": 0.0002, "epoch": 5.078125, "step": 3900}, {"loss": 1.1753, "grad_norm": 1.102354884147644, "learning_rate": 0.0002, "epoch": 5.091145833333333, "step": 3910}, {"loss": 1.1632, "grad_norm": 0.9567629098892212, "learning_rate": 0.0002, "epoch": 5.104166666666667, "step": 3920}, {"loss": 1.1875, "grad_norm": 0.9760252833366394, "learning_rate": 0.0002, "epoch": 5.1171875, "step": 3930}, {"loss": 1.2289, "grad_norm": 1.026168704032898, "learning_rate": 0.0002, "epoch": 5.130208333333333, "step": 3940}, {"loss": 1.1598, "grad_norm": 1.1490436792373657, "learning_rate": 0.0002, "epoch": 5.143229166666667, "step": 3950}, {"loss": 1.0823, "grad_norm": 0.9712087512016296, "learning_rate": 0.0002, "epoch": 5.15625, "step": 3960}, {"loss": 1.1948, "grad_norm": 1.0095003843307495, "learning_rate": 0.0002, "epoch": 5.169270833333333, "step": 3970}, {"loss": 1.1617, "grad_norm": 0.9171855449676514, "learning_rate": 0.0002, "epoch": 5.182291666666667, "step": 3980}, {"loss": 1.161, "grad_norm": 1.0105657577514648, "learning_rate": 0.0002, "epoch": 5.1953125, "step": 3990}, {"loss": 1.2098, "grad_norm": 1.0330145359039307, "learning_rate": 0.0002, "epoch": 5.208333333333333, "step": 4000}, {"loss": 1.1965, "grad_norm": 1.0676906108856201, "learning_rate": 0.0002, "epoch": 5.221354166666667, "step": 4010}, {"loss": 1.1392, "grad_norm": 1.055088758468628, "learning_rate": 0.0002, "epoch": 5.234375, "step": 4020}, {"loss": 1.2173, "grad_norm": 0.9523683786392212, "learning_rate": 0.0002, "epoch": 5.247395833333333, "step": 4030}, {"loss": 1.1167, "grad_norm": 0.9013799428939819, "learning_rate": 0.0002, "epoch": 5.260416666666667, "step": 4040}, {"loss": 1.2274, "grad_norm": 0.9379037618637085, "learning_rate": 0.0002, "epoch": 5.2734375, "step": 4050}, {"loss": 1.1246, "grad_norm": 0.9565327763557434, "learning_rate": 0.0002, "epoch": 5.286458333333333, "step": 4060}, {"loss": 1.2103, "grad_norm": 1.1994404792785645, "learning_rate": 0.0002, "epoch": 5.299479166666667, "step": 4070}, {"loss": 1.2016, "grad_norm": 1.0563262701034546, "learning_rate": 0.0002, "epoch": 5.3125, "step": 4080}, {"loss": 1.2478, "grad_norm": 1.024290680885315, "learning_rate": 0.0002, "epoch": 5.325520833333333, "step": 4090}, {"loss": 1.2388, "grad_norm": 1.0022907257080078, "learning_rate": 0.0002, "epoch": 5.338541666666667, "step": 4100}, {"loss": 1.1948, "grad_norm": 0.9642180800437927, "learning_rate": 0.0002, "epoch": 5.3515625, "step": 4110}, {"loss": 1.231, "grad_norm": 1.0228009223937988, "learning_rate": 0.0002, "epoch": 5.364583333333333, "step": 4120}, {"loss": 1.2341, "grad_norm": 1.0379719734191895, "learning_rate": 0.0002, "epoch": 5.377604166666667, "step": 4130}, {"loss": 1.24, "grad_norm": 1.147053599357605, "learning_rate": 0.0002, "epoch": 5.390625, "step": 4140}, {"loss": 1.2026, "grad_norm": 1.2097876071929932, "learning_rate": 0.0002, "epoch": 5.403645833333333, "step": 4150}, {"loss": 1.1978, "grad_norm": 1.0852497816085815, "learning_rate": 0.0002, "epoch": 5.416666666666667, "step": 4160}, {"loss": 1.2182, "grad_norm": 0.9765135645866394, "learning_rate": 0.0002, "epoch": 5.4296875, "step": 4170}, {"loss": 1.3117, "grad_norm": 1.0180606842041016, "learning_rate": 0.0002, "epoch": 5.442708333333333, "step": 4180}, {"loss": 1.2355, "grad_norm": 1.185409665107727, "learning_rate": 0.0002, "epoch": 5.455729166666667, "step": 4190}, {"loss": 1.1531, "grad_norm": 0.9363358020782471, "learning_rate": 0.0002, "epoch": 5.46875, "step": 4200}, {"loss": 1.1645, "grad_norm": 1.0761215686798096, "learning_rate": 0.0002, "epoch": 5.481770833333333, "step": 4210}, {"loss": 1.1465, "grad_norm": 1.057626724243164, "learning_rate": 0.0002, "epoch": 5.494791666666667, "step": 4220}, {"loss": 1.2051, "grad_norm": 1.0103157758712769, "learning_rate": 0.0002, "epoch": 5.5078125, "step": 4230}, {"loss": 1.2193, "grad_norm": 1.1056627035140991, "learning_rate": 0.0002, "epoch": 5.520833333333333, "step": 4240}, {"loss": 1.1941, "grad_norm": 1.0256257057189941, "learning_rate": 0.0002, "epoch": 5.533854166666667, "step": 4250}, {"loss": 1.1724, "grad_norm": 1.2814106941223145, "learning_rate": 0.0002, "epoch": 5.546875, "step": 4260}, {"loss": 1.1676, "grad_norm": 0.9044927954673767, "learning_rate": 0.0002, "epoch": 5.559895833333333, "step": 4270}, {"loss": 1.2448, "grad_norm": 0.9870165586471558, "learning_rate": 0.0002, "epoch": 5.572916666666667, "step": 4280}, {"loss": 1.2414, "grad_norm": 0.9867369532585144, "learning_rate": 0.0002, "epoch": 5.5859375, "step": 4290}, {"loss": 1.2115, "grad_norm": 1.045625925064087, "learning_rate": 0.0002, "epoch": 5.598958333333333, "step": 4300}, {"loss": 1.2786, "grad_norm": 0.979853630065918, "learning_rate": 0.0002, "epoch": 5.611979166666667, "step": 4310}, {"loss": 1.1629, "grad_norm": 1.029212236404419, "learning_rate": 0.0002, "epoch": 5.625, "step": 4320}, {"loss": 1.1985, "grad_norm": 1.0348633527755737, "learning_rate": 0.0002, "epoch": 5.638020833333333, "step": 4330}, {"loss": 1.1914, "grad_norm": 1.0055185556411743, "learning_rate": 0.0002, "epoch": 5.651041666666667, "step": 4340}, {"loss": 1.2658, "grad_norm": 0.9312447309494019, "learning_rate": 0.0002, "epoch": 5.6640625, "step": 4350}, {"loss": 1.1901, "grad_norm": 1.1411694288253784, "learning_rate": 0.0002, "epoch": 5.677083333333333, "step": 4360}, {"loss": 1.2679, "grad_norm": 0.9764434695243835, "learning_rate": 0.0002, "epoch": 5.690104166666667, "step": 4370}, {"loss": 1.2215, "grad_norm": 1.079154133796692, "learning_rate": 0.0002, "epoch": 5.703125, "step": 4380}, {"loss": 1.1659, "grad_norm": 0.999526858329773, "learning_rate": 0.0002, "epoch": 5.716145833333333, "step": 4390}, {"loss": 1.1685, "grad_norm": 1.1239734888076782, "learning_rate": 0.0002, "epoch": 5.729166666666667, "step": 4400}, {"loss": 1.1126, "grad_norm": 1.0539512634277344, "learning_rate": 0.0002, "epoch": 5.7421875, "step": 4410}, {"loss": 1.1413, "grad_norm": 0.9884052872657776, "learning_rate": 0.0002, "epoch": 5.755208333333333, "step": 4420}, {"loss": 1.1781, "grad_norm": 0.9821958541870117, "learning_rate": 0.0002, "epoch": 5.768229166666667, "step": 4430}, {"loss": 1.2319, "grad_norm": 0.9340839982032776, "learning_rate": 0.0002, "epoch": 5.78125, "step": 4440}, {"loss": 1.3085, "grad_norm": 0.9935781955718994, "learning_rate": 0.0002, "epoch": 5.794270833333333, "step": 4450}, {"loss": 1.1726, "grad_norm": 1.1027121543884277, "learning_rate": 0.0002, "epoch": 5.807291666666667, "step": 4460}, {"loss": 1.2385, "grad_norm": 0.9388337135314941, "learning_rate": 0.0002, "epoch": 5.8203125, "step": 4470}, {"loss": 1.259, "grad_norm": 1.0957310199737549, "learning_rate": 0.0002, "epoch": 5.833333333333333, "step": 4480}, {"loss": 1.3017, "grad_norm": 1.0832754373550415, "learning_rate": 0.0002, "epoch": 5.846354166666667, "step": 4490}, {"loss": 1.1724, "grad_norm": 0.9498379826545715, "learning_rate": 0.0002, "epoch": 5.859375, "step": 4500}, {"loss": 1.2312, "grad_norm": 0.9104725122451782, "learning_rate": 0.0002, "epoch": 5.872395833333333, "step": 4510}, {"loss": 1.204, "grad_norm": 1.2238177061080933, "learning_rate": 0.0002, "epoch": 5.885416666666667, "step": 4520}, {"loss": 1.2163, "grad_norm": 1.0549527406692505, "learning_rate": 0.0002, "epoch": 5.8984375, "step": 4530}, {"loss": 1.3086, "grad_norm": 1.0415066480636597, "learning_rate": 0.0002, "epoch": 5.911458333333333, "step": 4540}, {"loss": 1.1744, "grad_norm": 0.9098646640777588, "learning_rate": 0.0002, "epoch": 5.924479166666667, "step": 4550}, {"loss": 1.2126, "grad_norm": 0.9182857275009155, "learning_rate": 0.0002, "epoch": 5.9375, "step": 4560}, {"loss": 1.2341, "grad_norm": 1.088038444519043, "learning_rate": 0.0002, "epoch": 5.950520833333333, "step": 4570}, {"loss": 1.2317, "grad_norm": 1.1331020593643188, "learning_rate": 0.0002, "epoch": 5.963541666666667, "step": 4580}, {"loss": 1.2318, "grad_norm": 0.9592235088348389, "learning_rate": 0.0002, "epoch": 5.9765625, "step": 4590}, {"loss": 1.1995, "grad_norm": 1.0126368999481201, "learning_rate": 0.0002, "epoch": 5.989583333333333, "step": 4600}, {"eval_loss": 2.096651315689087, "eval_runtime": 43.1936, "eval_samples_per_second": 11.923, "eval_steps_per_second": 1.505, "epoch": 6.0, "step": 4608}, {"loss": 1.2061, "grad_norm": 1.0549334287643433, "learning_rate": 0.0002, "epoch": 6.002604166666667, "step": 4610}, {"loss": 1.0046, "grad_norm": 1.099247694015503, "learning_rate": 0.0002, "epoch": 6.015625, "step": 4620}, {"loss": 1.0542, "grad_norm": 1.0992592573165894, "learning_rate": 0.0002, "epoch": 6.028645833333333, "step": 4630}, {"loss": 1.0032, "grad_norm": 1.139350414276123, "learning_rate": 0.0002, "epoch": 6.041666666666667, "step": 4640}, {"loss": 1.0105, "grad_norm": 1.1316219568252563, "learning_rate": 0.0002, "epoch": 6.0546875, "step": 4650}, {"loss": 1.05, "grad_norm": 1.5254799127578735, "learning_rate": 0.0002, "epoch": 6.067708333333333, "step": 4660}, {"loss": 1.0357, "grad_norm": 1.155513048171997, "learning_rate": 0.0002, "epoch": 6.080729166666667, "step": 4670}, {"loss": 1.0782, "grad_norm": 1.311339259147644, "learning_rate": 0.0002, "epoch": 6.09375, "step": 4680}, {"loss": 1.098, "grad_norm": 0.9942600131034851, "learning_rate": 0.0002, "epoch": 6.106770833333333, "step": 4690}, {"loss": 0.9989, "grad_norm": 1.388214111328125, "learning_rate": 0.0002, "epoch": 6.119791666666667, "step": 4700}, {"loss": 1.0893, "grad_norm": 1.260488510131836, "learning_rate": 0.0002, "epoch": 6.1328125, "step": 4710}, {"loss": 1.0225, "grad_norm": 1.231615662574768, "learning_rate": 0.0002, "epoch": 6.145833333333333, "step": 4720}, {"loss": 1.0547, "grad_norm": 1.049696922302246, "learning_rate": 0.0002, "epoch": 6.158854166666667, "step": 4730}, {"loss": 1.0089, "grad_norm": 1.145426869392395, "learning_rate": 0.0002, "epoch": 6.171875, "step": 4740}, {"loss": 1.0751, "grad_norm": 1.1715868711471558, "learning_rate": 0.0002, "epoch": 6.184895833333333, "step": 4750}, {"loss": 0.9901, "grad_norm": 1.2575212717056274, "learning_rate": 0.0002, "epoch": 6.197916666666667, "step": 4760}, {"loss": 0.9775, "grad_norm": 1.2996530532836914, "learning_rate": 0.0002, "epoch": 6.2109375, "step": 4770}, {"loss": 1.0227, "grad_norm": 1.4030718803405762, "learning_rate": 0.0002, "epoch": 6.223958333333333, "step": 4780}, {"loss": 1.0439, "grad_norm": 1.2140913009643555, "learning_rate": 0.0002, "epoch": 6.236979166666667, "step": 4790}, {"loss": 1.0637, "grad_norm": 1.3512893915176392, "learning_rate": 0.0002, "epoch": 6.25, "step": 4800}, {"loss": 1.0367, "grad_norm": 1.1931439638137817, "learning_rate": 0.0002, "epoch": 6.263020833333333, "step": 4810}, {"loss": 1.0615, "grad_norm": 1.0379345417022705, "learning_rate": 0.0002, "epoch": 6.276041666666667, "step": 4820}, {"loss": 1.0954, "grad_norm": 1.1571568250656128, "learning_rate": 0.0002, "epoch": 6.2890625, "step": 4830}, {"loss": 1.0029, "grad_norm": 1.0717264413833618, "learning_rate": 0.0002, "epoch": 6.302083333333333, "step": 4840}, {"loss": 1.0466, "grad_norm": 1.360496997833252, "learning_rate": 0.0002, "epoch": 6.315104166666667, "step": 4850}, {"loss": 1.001, "grad_norm": 1.0864052772521973, "learning_rate": 0.0002, "epoch": 6.328125, "step": 4860}, {"loss": 1.0229, "grad_norm": 1.3391871452331543, "learning_rate": 0.0002, "epoch": 6.341145833333333, "step": 4870}, {"loss": 1.0797, "grad_norm": 1.2568541765213013, "learning_rate": 0.0002, "epoch": 6.354166666666667, "step": 4880}, {"loss": 1.1076, "grad_norm": 1.255483627319336, "learning_rate": 0.0002, "epoch": 6.3671875, "step": 4890}, {"loss": 1.0244, "grad_norm": 1.173972487449646, "learning_rate": 0.0002, "epoch": 6.380208333333333, "step": 4900}, {"loss": 1.0238, "grad_norm": 1.14010488986969, "learning_rate": 0.0002, "epoch": 6.393229166666667, "step": 4910}, {"loss": 1.0319, "grad_norm": 1.1317493915557861, "learning_rate": 0.0002, "epoch": 6.40625, "step": 4920}, {"loss": 1.0195, "grad_norm": 1.1547486782073975, "learning_rate": 0.0002, "epoch": 6.419270833333333, "step": 4930}, {"loss": 1.0456, "grad_norm": 1.1822998523712158, "learning_rate": 0.0002, "epoch": 6.432291666666667, "step": 4940}, {"loss": 1.0535, "grad_norm": 1.1865756511688232, "learning_rate": 0.0002, "epoch": 6.4453125, "step": 4950}, {"loss": 1.0255, "grad_norm": 1.13661789894104, "learning_rate": 0.0002, "epoch": 6.458333333333333, "step": 4960}, {"loss": 1.0771, "grad_norm": 1.047326683998108, "learning_rate": 0.0002, "epoch": 6.471354166666667, "step": 4970}, {"loss": 1.0965, "grad_norm": 1.3550827503204346, "learning_rate": 0.0002, "epoch": 6.484375, "step": 4980}, {"loss": 1.0984, "grad_norm": 1.2868435382843018, "learning_rate": 0.0002, "epoch": 6.497395833333333, "step": 4990}, {"loss": 1.1046, "grad_norm": 1.4678666591644287, "learning_rate": 0.0002, "epoch": 6.510416666666667, "step": 5000}, {"loss": 1.076, "grad_norm": 1.3739159107208252, "learning_rate": 0.0002, "epoch": 6.5234375, "step": 5010}, {"loss": 1.046, "grad_norm": 1.213034987449646, "learning_rate": 0.0002, "epoch": 6.536458333333333, "step": 5020}, {"loss": 1.1129, "grad_norm": 1.5025049448013306, "learning_rate": 0.0002, "epoch": 6.549479166666667, "step": 5030}, {"loss": 1.0564, "grad_norm": 1.1811821460723877, "learning_rate": 0.0002, "epoch": 6.5625, "step": 5040}, {"loss": 1.1096, "grad_norm": 1.2845960855484009, "learning_rate": 0.0002, "epoch": 6.575520833333333, "step": 5050}, {"loss": 1.0274, "grad_norm": 1.0641103982925415, "learning_rate": 0.0002, "epoch": 6.588541666666667, "step": 5060}, {"loss": 1.0559, "grad_norm": 1.0967134237289429, "learning_rate": 0.0002, "epoch": 6.6015625, "step": 5070}, {"loss": 1.0965, "grad_norm": 1.1802116632461548, "learning_rate": 0.0002, "epoch": 6.614583333333333, "step": 5080}, {"loss": 1.0296, "grad_norm": 1.3110308647155762, "learning_rate": 0.0002, "epoch": 6.627604166666667, "step": 5090}, {"loss": 1.0273, "grad_norm": 1.1863301992416382, "learning_rate": 0.0002, "epoch": 6.640625, "step": 5100}, {"loss": 1.1355, "grad_norm": 1.0931109189987183, "learning_rate": 0.0002, "epoch": 6.653645833333333, "step": 5110}, {"loss": 1.1025, "grad_norm": 1.0571614503860474, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 5120}, {"loss": 1.1292, "grad_norm": 1.2855656147003174, "learning_rate": 0.0002, "epoch": 6.6796875, "step": 5130}, {"loss": 1.0582, "grad_norm": 1.2217806577682495, "learning_rate": 0.0002, "epoch": 6.692708333333333, "step": 5140}, {"loss": 1.1098, "grad_norm": 1.093658447265625, "learning_rate": 0.0002, "epoch": 6.705729166666667, "step": 5150}, {"loss": 1.0845, "grad_norm": 1.2592076063156128, "learning_rate": 0.0002, "epoch": 6.71875, "step": 5160}, {"loss": 1.0381, "grad_norm": 1.0720105171203613, "learning_rate": 0.0002, "epoch": 6.731770833333333, "step": 5170}, {"loss": 1.0707, "grad_norm": 1.178058385848999, "learning_rate": 0.0002, "epoch": 6.744791666666667, "step": 5180}, {"loss": 1.116, "grad_norm": 1.1897447109222412, "learning_rate": 0.0002, "epoch": 6.7578125, "step": 5190}, {"loss": 1.064, "grad_norm": 1.3547686338424683, "learning_rate": 0.0002, "epoch": 6.770833333333333, "step": 5200}, {"loss": 1.0642, "grad_norm": 1.2514727115631104, "learning_rate": 0.0002, "epoch": 6.783854166666667, "step": 5210}, {"loss": 1.0898, "grad_norm": 1.5253846645355225, "learning_rate": 0.0002, "epoch": 6.796875, "step": 5220}, {"loss": 1.0426, "grad_norm": 1.090774655342102, "learning_rate": 0.0002, "epoch": 6.809895833333333, "step": 5230}, {"loss": 1.0867, "grad_norm": 1.1387991905212402, "learning_rate": 0.0002, "epoch": 6.822916666666667, "step": 5240}, {"loss": 1.0493, "grad_norm": 1.102423906326294, "learning_rate": 0.0002, "epoch": 6.8359375, "step": 5250}, {"loss": 1.0976, "grad_norm": 1.2453415393829346, "learning_rate": 0.0002, "epoch": 6.848958333333333, "step": 5260}, {"loss": 1.1046, "grad_norm": 1.2541141510009766, "learning_rate": 0.0002, "epoch": 6.861979166666667, "step": 5270}, {"loss": 1.0816, "grad_norm": 1.2719744443893433, "learning_rate": 0.0002, "epoch": 6.875, "step": 5280}, {"loss": 1.0399, "grad_norm": 1.085763931274414, "learning_rate": 0.0002, "epoch": 6.888020833333333, "step": 5290}, {"loss": 1.1306, "grad_norm": 1.2399879693984985, "learning_rate": 0.0002, "epoch": 6.901041666666667, "step": 5300}, {"loss": 1.1178, "grad_norm": 1.244888424873352, "learning_rate": 0.0002, "epoch": 6.9140625, "step": 5310}, {"loss": 1.0868, "grad_norm": 1.1424126625061035, "learning_rate": 0.0002, "epoch": 6.927083333333333, "step": 5320}, {"loss": 1.0768, "grad_norm": 1.1804956197738647, "learning_rate": 0.0002, "epoch": 6.940104166666667, "step": 5330}, {"loss": 1.0803, "grad_norm": 1.3943406343460083, "learning_rate": 0.0002, "epoch": 6.953125, "step": 5340}, {"loss": 1.0573, "grad_norm": 1.3278584480285645, "learning_rate": 0.0002, "epoch": 6.966145833333333, "step": 5350}, {"loss": 1.1008, "grad_norm": 1.3579362630844116, "learning_rate": 0.0002, "epoch": 6.979166666666667, "step": 5360}, {"loss": 1.059, "grad_norm": 1.2172175645828247, "learning_rate": 0.0002, "epoch": 6.9921875, "step": 5370}]} +{"epoch": 8.0, "step": 6144, "epoch_duration": 925.2596716880798, "total_accumulated_duration": 15415.20847940445, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.6-num-4217-sd-10000/checkpoint-1536", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.6589, "grad_norm": 0.513252854347229, "learning_rate": 0.0002, "epoch": 0.013020833333333334, "step": 10}, {"loss": 2.307, "grad_norm": 0.5675475001335144, "learning_rate": 0.0002, "epoch": 0.026041666666666668, "step": 20}, {"loss": 2.0492, "grad_norm": 0.5074710845947266, "learning_rate": 0.0002, "epoch": 0.0390625, "step": 30}, {"loss": 2.0109, "grad_norm": 0.7609530687332153, "learning_rate": 0.0002, "epoch": 0.052083333333333336, "step": 40}, {"loss": 1.8852, "grad_norm": 0.5691684484481812, "learning_rate": 0.0002, "epoch": 0.06510416666666667, "step": 50}, {"loss": 1.8763, "grad_norm": 0.5346821546554565, "learning_rate": 0.0002, "epoch": 0.078125, "step": 60}, {"loss": 1.8639, "grad_norm": 0.46337810158729553, "learning_rate": 0.0002, "epoch": 0.09114583333333333, "step": 70}, {"loss": 1.8124, "grad_norm": 0.4698766767978668, "learning_rate": 0.0002, "epoch": 0.10416666666666667, "step": 80}, {"loss": 1.8101, "grad_norm": 0.43780726194381714, "learning_rate": 0.0002, "epoch": 0.1171875, "step": 90}, {"loss": 1.8044, "grad_norm": 0.9183378219604492, "learning_rate": 0.0002, "epoch": 0.13020833333333334, "step": 100}, {"loss": 1.9022, "grad_norm": 0.44829392433166504, "learning_rate": 0.0002, "epoch": 0.14322916666666666, "step": 110}, {"loss": 1.8906, "grad_norm": 0.3734739422798157, "learning_rate": 0.0002, "epoch": 0.15625, "step": 120}, {"loss": 1.8302, "grad_norm": 0.4368326663970947, "learning_rate": 0.0002, "epoch": 0.16927083333333334, "step": 130}, {"loss": 1.898, "grad_norm": 0.3962480127811432, "learning_rate": 0.0002, "epoch": 0.18229166666666666, "step": 140}, {"loss": 1.8136, "grad_norm": 0.4569706916809082, "learning_rate": 0.0002, "epoch": 0.1953125, "step": 150}, {"loss": 1.8676, "grad_norm": 0.4076327383518219, "learning_rate": 0.0002, "epoch": 0.20833333333333334, "step": 160}, {"loss": 1.7927, "grad_norm": 0.4026809632778168, "learning_rate": 0.0002, "epoch": 0.22135416666666666, "step": 170}, {"loss": 1.8999, "grad_norm": 0.40455079078674316, "learning_rate": 0.0002, "epoch": 0.234375, "step": 180}, {"loss": 1.8397, "grad_norm": 0.40840157866477966, "learning_rate": 0.0002, "epoch": 0.24739583333333334, "step": 190}, {"loss": 1.7216, "grad_norm": 0.4101830720901489, "learning_rate": 0.0002, "epoch": 0.2604166666666667, "step": 200}, {"loss": 1.8106, "grad_norm": 0.3911910057067871, "learning_rate": 0.0002, "epoch": 0.2734375, "step": 210}, {"loss": 1.8519, "grad_norm": 0.4409257173538208, "learning_rate": 0.0002, "epoch": 0.2864583333333333, "step": 220}, {"loss": 1.8192, "grad_norm": 0.39020729064941406, "learning_rate": 0.0002, "epoch": 0.2994791666666667, "step": 230}, {"loss": 1.7586, "grad_norm": 0.4311807155609131, "learning_rate": 0.0002, "epoch": 0.3125, "step": 240}, {"loss": 1.7477, "grad_norm": 0.3851333558559418, "learning_rate": 0.0002, "epoch": 0.3255208333333333, "step": 250}, {"loss": 1.7896, "grad_norm": 0.37738412618637085, "learning_rate": 0.0002, "epoch": 0.3385416666666667, "step": 260}, {"loss": 1.783, "grad_norm": 0.3525104820728302, "learning_rate": 0.0002, "epoch": 0.3515625, "step": 270}, {"loss": 1.7724, "grad_norm": 0.418957382440567, "learning_rate": 0.0002, "epoch": 0.3645833333333333, "step": 280}, {"loss": 1.7989, "grad_norm": 0.40066027641296387, "learning_rate": 0.0002, "epoch": 0.3776041666666667, "step": 290}, {"loss": 1.7294, "grad_norm": 0.379321813583374, "learning_rate": 0.0002, "epoch": 0.390625, "step": 300}, {"loss": 1.869, "grad_norm": 0.35400667786598206, "learning_rate": 0.0002, "epoch": 0.4036458333333333, "step": 310}, {"loss": 1.7546, "grad_norm": 0.6621660590171814, "learning_rate": 0.0002, "epoch": 0.4166666666666667, "step": 320}, {"loss": 1.8251, "grad_norm": 0.3783826529979706, "learning_rate": 0.0002, "epoch": 0.4296875, "step": 330}, {"loss": 1.688, "grad_norm": 0.3920382857322693, "learning_rate": 0.0002, "epoch": 0.4427083333333333, "step": 340}, {"loss": 1.8204, "grad_norm": 0.3657408654689789, "learning_rate": 0.0002, "epoch": 0.4557291666666667, "step": 350}, {"loss": 1.7719, "grad_norm": 0.3717544674873352, "learning_rate": 0.0002, "epoch": 0.46875, "step": 360}, {"loss": 1.7863, "grad_norm": 0.33955204486846924, "learning_rate": 0.0002, "epoch": 0.4817708333333333, "step": 370}, {"loss": 1.7751, "grad_norm": 0.33888939023017883, "learning_rate": 0.0002, "epoch": 0.4947916666666667, "step": 380}, {"loss": 1.7366, "grad_norm": 0.3748014271259308, "learning_rate": 0.0002, "epoch": 0.5078125, "step": 390}, {"loss": 1.7946, "grad_norm": 0.37372609972953796, "learning_rate": 0.0002, "epoch": 0.5208333333333334, "step": 400}, {"loss": 1.7604, "grad_norm": 0.4089180827140808, "learning_rate": 0.0002, "epoch": 0.5338541666666666, "step": 410}, {"loss": 1.7767, "grad_norm": 0.38470903038978577, "learning_rate": 0.0002, "epoch": 0.546875, "step": 420}, {"loss": 1.814, "grad_norm": 0.33426186442375183, "learning_rate": 0.0002, "epoch": 0.5598958333333334, "step": 430}, {"loss": 1.6738, "grad_norm": 0.3802422285079956, "learning_rate": 0.0002, "epoch": 0.5729166666666666, "step": 440}, {"loss": 1.7983, "grad_norm": 0.3245152533054352, "learning_rate": 0.0002, "epoch": 0.5859375, "step": 450}, {"loss": 1.7298, "grad_norm": 0.34128233790397644, "learning_rate": 0.0002, "epoch": 0.5989583333333334, "step": 460}, {"loss": 1.7947, "grad_norm": 0.33154451847076416, "learning_rate": 0.0002, "epoch": 0.6119791666666666, "step": 470}, {"loss": 1.7417, "grad_norm": 0.34642690420150757, "learning_rate": 0.0002, "epoch": 0.625, "step": 480}, {"loss": 1.7242, "grad_norm": 0.37599194049835205, "learning_rate": 0.0002, "epoch": 0.6380208333333334, "step": 490}, {"loss": 1.7591, "grad_norm": 0.4088667333126068, "learning_rate": 0.0002, "epoch": 0.6510416666666666, "step": 500}, {"loss": 1.7216, "grad_norm": 0.35734823346138, "learning_rate": 0.0002, "epoch": 0.6640625, "step": 510}, {"loss": 1.8128, "grad_norm": 0.38925203680992126, "learning_rate": 0.0002, "epoch": 0.6770833333333334, "step": 520}, {"loss": 1.7671, "grad_norm": 0.3787044584751129, "learning_rate": 0.0002, "epoch": 0.6901041666666666, "step": 530}, {"loss": 1.8375, "grad_norm": 0.35195621848106384, "learning_rate": 0.0002, "epoch": 0.703125, "step": 540}, {"loss": 1.7469, "grad_norm": 0.39059996604919434, "learning_rate": 0.0002, "epoch": 0.7161458333333334, "step": 550}, {"loss": 1.7351, "grad_norm": 0.5075398683547974, "learning_rate": 0.0002, "epoch": 0.7291666666666666, "step": 560}, {"loss": 1.7276, "grad_norm": 0.4286627471446991, "learning_rate": 0.0002, "epoch": 0.7421875, "step": 570}, {"loss": 1.8418, "grad_norm": 0.33405354619026184, "learning_rate": 0.0002, "epoch": 0.7552083333333334, "step": 580}, {"loss": 1.7724, "grad_norm": 0.37269648909568787, "learning_rate": 0.0002, "epoch": 0.7682291666666666, "step": 590}, {"loss": 1.7658, "grad_norm": 0.3618223965167999, "learning_rate": 0.0002, "epoch": 0.78125, "step": 600}, {"loss": 1.7717, "grad_norm": 0.33787694573402405, "learning_rate": 0.0002, "epoch": 0.7942708333333334, "step": 610}, {"loss": 1.8033, "grad_norm": 0.4018900990486145, "learning_rate": 0.0002, "epoch": 0.8072916666666666, "step": 620}, {"loss": 1.8206, "grad_norm": 0.3892900049686432, "learning_rate": 0.0002, "epoch": 0.8203125, "step": 630}, {"loss": 1.7331, "grad_norm": 0.33400827646255493, "learning_rate": 0.0002, "epoch": 0.8333333333333334, "step": 640}, {"loss": 1.7139, "grad_norm": 0.3237822353839874, "learning_rate": 0.0002, "epoch": 0.8463541666666666, "step": 650}, {"loss": 1.8172, "grad_norm": 0.35551393032073975, "learning_rate": 0.0002, "epoch": 0.859375, "step": 660}, {"loss": 1.8265, "grad_norm": 0.38883528113365173, "learning_rate": 0.0002, "epoch": 0.8723958333333334, "step": 670}, {"loss": 1.7841, "grad_norm": 0.35139647126197815, "learning_rate": 0.0002, "epoch": 0.8854166666666666, "step": 680}, {"loss": 1.7591, "grad_norm": 0.3403511941432953, "learning_rate": 0.0002, "epoch": 0.8984375, "step": 690}, {"loss": 1.7224, "grad_norm": 0.32814469933509827, "learning_rate": 0.0002, "epoch": 0.9114583333333334, "step": 700}, {"loss": 1.7968, "grad_norm": 0.3933236598968506, "learning_rate": 0.0002, "epoch": 0.9244791666666666, "step": 710}, {"loss": 1.7249, "grad_norm": 0.3436862528324127, "learning_rate": 0.0002, "epoch": 0.9375, "step": 720}, {"loss": 1.7717, "grad_norm": 0.32683226466178894, "learning_rate": 0.0002, "epoch": 0.9505208333333334, "step": 730}, {"loss": 1.7511, "grad_norm": 0.32675468921661377, "learning_rate": 0.0002, "epoch": 0.9635416666666666, "step": 740}, {"loss": 1.7429, "grad_norm": 0.371297150850296, "learning_rate": 0.0002, "epoch": 0.9765625, "step": 750}, {"loss": 1.777, "grad_norm": 0.39658334851264954, "learning_rate": 0.0002, "epoch": 0.9895833333333334, "step": 760}, {"eval_loss": 1.8215787410736084, "eval_runtime": 102.4906, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.634, "epoch": 1.0, "step": 768}, {"loss": 1.8072, "grad_norm": 0.303970068693161, "learning_rate": 0.0002, "epoch": 1.0026041666666667, "step": 770}, {"loss": 1.6708, "grad_norm": 0.32745876908302307, "learning_rate": 0.0002, "epoch": 1.015625, "step": 780}, {"loss": 1.623, "grad_norm": 0.33467888832092285, "learning_rate": 0.0002, "epoch": 1.0286458333333333, "step": 790}, {"loss": 1.746, "grad_norm": 0.38253068923950195, "learning_rate": 0.0002, "epoch": 1.0416666666666667, "step": 800}, {"loss": 1.685, "grad_norm": 0.3955802023410797, "learning_rate": 0.0002, "epoch": 1.0546875, "step": 810}, {"loss": 1.7395, "grad_norm": 0.3534117043018341, "learning_rate": 0.0002, "epoch": 1.0677083333333333, "step": 820}, {"loss": 1.6361, "grad_norm": 0.33427858352661133, "learning_rate": 0.0002, "epoch": 1.0807291666666667, "step": 830}, {"loss": 1.7435, "grad_norm": 0.35261571407318115, "learning_rate": 0.0002, "epoch": 1.09375, "step": 840}, {"loss": 1.7112, "grad_norm": 0.4416263997554779, "learning_rate": 0.0002, "epoch": 1.1067708333333333, "step": 850}, {"loss": 1.6311, "grad_norm": 0.3918050229549408, "learning_rate": 0.0002, "epoch": 1.1197916666666667, "step": 860}, {"loss": 1.6804, "grad_norm": 0.38482677936553955, "learning_rate": 0.0002, "epoch": 1.1328125, "step": 870}, {"loss": 1.6951, "grad_norm": 0.4945143759250641, "learning_rate": 0.0002, "epoch": 1.1458333333333333, "step": 880}, {"loss": 1.7577, "grad_norm": 0.429677814245224, "learning_rate": 0.0002, "epoch": 1.1588541666666667, "step": 890}, {"loss": 1.7204, "grad_norm": 0.41878288984298706, "learning_rate": 0.0002, "epoch": 1.171875, "step": 900}, {"loss": 1.717, "grad_norm": 0.41578373312950134, "learning_rate": 0.0002, "epoch": 1.1848958333333333, "step": 910}, {"loss": 1.7017, "grad_norm": 0.37028902769088745, "learning_rate": 0.0002, "epoch": 1.1979166666666667, "step": 920}, {"loss": 1.7074, "grad_norm": 0.3824995756149292, "learning_rate": 0.0002, "epoch": 1.2109375, "step": 930}, {"loss": 1.6185, "grad_norm": 0.3818865418434143, "learning_rate": 0.0002, "epoch": 1.2239583333333333, "step": 940}, {"loss": 1.7894, "grad_norm": 0.3930460810661316, "learning_rate": 0.0002, "epoch": 1.2369791666666667, "step": 950}, {"loss": 1.6766, "grad_norm": 0.3904426395893097, "learning_rate": 0.0002, "epoch": 1.25, "step": 960}, {"loss": 1.7072, "grad_norm": 0.4175802171230316, "learning_rate": 0.0002, "epoch": 1.2630208333333333, "step": 970}, {"loss": 1.7556, "grad_norm": 0.42343786358833313, "learning_rate": 0.0002, "epoch": 1.2760416666666667, "step": 980}, {"loss": 1.6339, "grad_norm": 0.4168420135974884, "learning_rate": 0.0002, "epoch": 1.2890625, "step": 990}, {"loss": 1.727, "grad_norm": 0.38692983984947205, "learning_rate": 0.0002, "epoch": 1.3020833333333333, "step": 1000}, {"loss": 1.6384, "grad_norm": 0.5037692189216614, "learning_rate": 0.0002, "epoch": 1.3151041666666667, "step": 1010}, {"loss": 1.6878, "grad_norm": 0.39436691999435425, "learning_rate": 0.0002, "epoch": 1.328125, "step": 1020}, {"loss": 1.7113, "grad_norm": 0.3431943356990814, "learning_rate": 0.0002, "epoch": 1.3411458333333333, "step": 1030}, {"loss": 1.7034, "grad_norm": 0.39167070388793945, "learning_rate": 0.0002, "epoch": 1.3541666666666667, "step": 1040}, {"loss": 1.7108, "grad_norm": 0.3820446729660034, "learning_rate": 0.0002, "epoch": 1.3671875, "step": 1050}, {"loss": 1.7885, "grad_norm": 0.4190749526023865, "learning_rate": 0.0002, "epoch": 1.3802083333333333, "step": 1060}, {"loss": 1.7548, "grad_norm": 0.3618869185447693, "learning_rate": 0.0002, "epoch": 1.3932291666666667, "step": 1070}, {"loss": 1.6199, "grad_norm": 0.38852423429489136, "learning_rate": 0.0002, "epoch": 1.40625, "step": 1080}, {"loss": 1.733, "grad_norm": 0.49829256534576416, "learning_rate": 0.0002, "epoch": 1.4192708333333333, "step": 1090}, {"loss": 1.6589, "grad_norm": 0.3956700563430786, "learning_rate": 0.0002, "epoch": 1.4322916666666667, "step": 1100}, {"loss": 1.5866, "grad_norm": 0.38829147815704346, "learning_rate": 0.0002, "epoch": 1.4453125, "step": 1110}, {"loss": 1.6709, "grad_norm": 0.37237483263015747, "learning_rate": 0.0002, "epoch": 1.4583333333333333, "step": 1120}, {"loss": 1.64, "grad_norm": 0.39798808097839355, "learning_rate": 0.0002, "epoch": 1.4713541666666667, "step": 1130}, {"loss": 1.7484, "grad_norm": 0.38188642263412476, "learning_rate": 0.0002, "epoch": 1.484375, "step": 1140}, {"loss": 1.6707, "grad_norm": 0.44961944222450256, "learning_rate": 0.0002, "epoch": 1.4973958333333333, "step": 1150}, {"loss": 1.6241, "grad_norm": 0.3816550374031067, "learning_rate": 0.0002, "epoch": 1.5104166666666665, "step": 1160}, {"loss": 1.7606, "grad_norm": 0.3885478973388672, "learning_rate": 0.0002, "epoch": 1.5234375, "step": 1170}, {"loss": 1.7285, "grad_norm": 0.42779695987701416, "learning_rate": 0.0002, "epoch": 1.5364583333333335, "step": 1180}, {"loss": 1.7399, "grad_norm": 0.41499748826026917, "learning_rate": 0.0002, "epoch": 1.5494791666666665, "step": 1190}, {"loss": 1.6569, "grad_norm": 0.4319412410259247, "learning_rate": 0.0002, "epoch": 1.5625, "step": 1200}, {"loss": 1.7297, "grad_norm": 0.38847389817237854, "learning_rate": 0.0002, "epoch": 1.5755208333333335, "step": 1210}, {"loss": 1.6666, "grad_norm": 0.45832890272140503, "learning_rate": 0.0002, "epoch": 1.5885416666666665, "step": 1220}, {"loss": 1.68, "grad_norm": 0.45928797125816345, "learning_rate": 0.0002, "epoch": 1.6015625, "step": 1230}, {"loss": 1.7225, "grad_norm": 0.4052276611328125, "learning_rate": 0.0002, "epoch": 1.6145833333333335, "step": 1240}, {"loss": 1.6722, "grad_norm": 0.4031650424003601, "learning_rate": 0.0002, "epoch": 1.6276041666666665, "step": 1250}, {"loss": 1.7243, "grad_norm": 0.36724114418029785, "learning_rate": 0.0002, "epoch": 1.640625, "step": 1260}, {"loss": 1.7672, "grad_norm": 0.4188505709171295, "learning_rate": 0.0002, "epoch": 1.6536458333333335, "step": 1270}, {"loss": 1.7685, "grad_norm": 0.3982168138027191, "learning_rate": 0.0002, "epoch": 1.6666666666666665, "step": 1280}, {"loss": 1.6831, "grad_norm": 0.3768596053123474, "learning_rate": 0.0002, "epoch": 1.6796875, "step": 1290}, {"loss": 1.6868, "grad_norm": 0.3843287527561188, "learning_rate": 0.0002, "epoch": 1.6927083333333335, "step": 1300}, {"loss": 1.6188, "grad_norm": 0.3982345461845398, "learning_rate": 0.0002, "epoch": 1.7057291666666665, "step": 1310}, {"loss": 1.7084, "grad_norm": 0.3407546281814575, "learning_rate": 0.0002, "epoch": 1.71875, "step": 1320}, {"loss": 1.7316, "grad_norm": 0.36327359080314636, "learning_rate": 0.0002, "epoch": 1.7317708333333335, "step": 1330}, {"loss": 1.734, "grad_norm": 0.4141675531864166, "learning_rate": 0.0002, "epoch": 1.7447916666666665, "step": 1340}, {"loss": 1.7257, "grad_norm": 0.43894267082214355, "learning_rate": 0.0002, "epoch": 1.7578125, "step": 1350}, {"loss": 1.6613, "grad_norm": 0.40564292669296265, "learning_rate": 0.0002, "epoch": 1.7708333333333335, "step": 1360}, {"loss": 1.6841, "grad_norm": 0.3978462815284729, "learning_rate": 0.0002, "epoch": 1.7838541666666665, "step": 1370}, {"loss": 1.6497, "grad_norm": 0.37140771746635437, "learning_rate": 0.0002, "epoch": 1.796875, "step": 1380}, {"loss": 1.742, "grad_norm": 0.43164145946502686, "learning_rate": 0.0002, "epoch": 1.8098958333333335, "step": 1390}, {"loss": 1.7253, "grad_norm": 0.38034674525260925, "learning_rate": 0.0002, "epoch": 1.8229166666666665, "step": 1400}, {"loss": 1.652, "grad_norm": 0.4235687851905823, "learning_rate": 0.0002, "epoch": 1.8359375, "step": 1410}, {"loss": 1.752, "grad_norm": 0.37417489290237427, "learning_rate": 0.0002, "epoch": 1.8489583333333335, "step": 1420}, {"loss": 1.6995, "grad_norm": 0.4303789734840393, "learning_rate": 0.0002, "epoch": 1.8619791666666665, "step": 1430}, {"loss": 1.6489, "grad_norm": 0.43942129611968994, "learning_rate": 0.0002, "epoch": 1.875, "step": 1440}, {"loss": 1.7989, "grad_norm": 0.3866581320762634, "learning_rate": 0.0002, "epoch": 1.8880208333333335, "step": 1450}, {"loss": 1.72, "grad_norm": 0.3686903417110443, "learning_rate": 0.0002, "epoch": 1.9010416666666665, "step": 1460}, {"loss": 1.6545, "grad_norm": 0.3885461986064911, "learning_rate": 0.0002, "epoch": 1.9140625, "step": 1470}, {"loss": 1.6981, "grad_norm": 0.4156927466392517, "learning_rate": 0.0002, "epoch": 1.9270833333333335, "step": 1480}, {"loss": 1.5921, "grad_norm": 0.3934236168861389, "learning_rate": 0.0002, "epoch": 1.9401041666666665, "step": 1490}, {"loss": 1.7384, "grad_norm": 0.38645586371421814, "learning_rate": 0.0002, "epoch": 1.953125, "step": 1500}, {"loss": 1.7033, "grad_norm": 0.43272635340690613, "learning_rate": 0.0002, "epoch": 1.9661458333333335, "step": 1510}, {"loss": 1.6138, "grad_norm": 0.42476025223731995, "learning_rate": 0.0002, "epoch": 1.9791666666666665, "step": 1520}, {"loss": 1.5834, "grad_norm": 0.37216147780418396, "learning_rate": 0.0002, "epoch": 1.9921875, "step": 1530}, {"eval_loss": 1.820037841796875, "eval_runtime": 101.0456, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.643, "epoch": 2.0, "step": 1536}, {"loss": 1.6395, "grad_norm": 0.39003029465675354, "learning_rate": 0.0002, "epoch": 2.0052083333333335, "step": 1540}, {"loss": 1.5447, "grad_norm": 0.4302637577056885, "learning_rate": 0.0002, "epoch": 2.0182291666666665, "step": 1550}, {"loss": 1.5951, "grad_norm": 0.4496043026447296, "learning_rate": 0.0002, "epoch": 2.03125, "step": 1560}, {"loss": 1.6032, "grad_norm": 0.42824679613113403, "learning_rate": 0.0002, "epoch": 2.0442708333333335, "step": 1570}, {"loss": 1.5996, "grad_norm": 0.44775739312171936, "learning_rate": 0.0002, "epoch": 2.0572916666666665, "step": 1580}, {"loss": 1.571, "grad_norm": 0.4705299735069275, "learning_rate": 0.0002, "epoch": 2.0703125, "step": 1590}, {"loss": 1.7589, "grad_norm": 0.4614814817905426, "learning_rate": 0.0002, "epoch": 2.0833333333333335, "step": 1600}, {"loss": 1.5762, "grad_norm": 0.45097213983535767, "learning_rate": 0.0002, "epoch": 2.0963541666666665, "step": 1610}, {"loss": 1.4947, "grad_norm": 0.41954323649406433, "learning_rate": 0.0002, "epoch": 2.109375, "step": 1620}, {"loss": 1.6397, "grad_norm": 0.44894352555274963, "learning_rate": 0.0002, "epoch": 2.1223958333333335, "step": 1630}, {"loss": 1.5251, "grad_norm": 0.4421502947807312, "learning_rate": 0.0002, "epoch": 2.1354166666666665, "step": 1640}, {"loss": 1.5931, "grad_norm": 0.44649967551231384, "learning_rate": 0.0002, "epoch": 2.1484375, "step": 1650}, {"loss": 1.6327, "grad_norm": 0.44216716289520264, "learning_rate": 0.0002, "epoch": 2.1614583333333335, "step": 1660}, {"loss": 1.5924, "grad_norm": 0.6363232135772705, "learning_rate": 0.0002, "epoch": 2.1744791666666665, "step": 1670}, {"loss": 1.6151, "grad_norm": 0.46533334255218506, "learning_rate": 0.0002, "epoch": 2.1875, "step": 1680}, {"loss": 1.5539, "grad_norm": 0.48486822843551636, "learning_rate": 0.0002, "epoch": 2.2005208333333335, "step": 1690}, {"loss": 1.6322, "grad_norm": 0.43277066946029663, "learning_rate": 0.0002, "epoch": 2.2135416666666665, "step": 1700}, {"loss": 1.4979, "grad_norm": 0.45927226543426514, "learning_rate": 0.0002, "epoch": 2.2265625, "step": 1710}, {"loss": 1.5917, "grad_norm": 0.4654010236263275, "learning_rate": 0.0002, "epoch": 2.2395833333333335, "step": 1720}, {"loss": 1.5713, "grad_norm": 0.49796584248542786, "learning_rate": 0.0002, "epoch": 2.2526041666666665, "step": 1730}, {"loss": 1.587, "grad_norm": 0.4506736397743225, "learning_rate": 0.0002, "epoch": 2.265625, "step": 1740}, {"loss": 1.5961, "grad_norm": 0.46757954359054565, "learning_rate": 0.0002, "epoch": 2.2786458333333335, "step": 1750}, {"loss": 1.6307, "grad_norm": 0.4507335424423218, "learning_rate": 0.0002, "epoch": 2.2916666666666665, "step": 1760}, {"loss": 1.5905, "grad_norm": 0.43900197744369507, "learning_rate": 0.0002, "epoch": 2.3046875, "step": 1770}, {"loss": 1.6655, "grad_norm": 0.48013004660606384, "learning_rate": 0.0002, "epoch": 2.3177083333333335, "step": 1780}, {"loss": 1.6024, "grad_norm": 0.41891220211982727, "learning_rate": 0.0002, "epoch": 2.3307291666666665, "step": 1790}, {"loss": 1.658, "grad_norm": 0.4879191219806671, "learning_rate": 0.0002, "epoch": 2.34375, "step": 1800}, {"loss": 1.6084, "grad_norm": 0.46148231625556946, "learning_rate": 0.0002, "epoch": 2.3567708333333335, "step": 1810}, {"loss": 1.6072, "grad_norm": 0.5114223957061768, "learning_rate": 0.0002, "epoch": 2.3697916666666665, "step": 1820}, {"loss": 1.5505, "grad_norm": 0.4828612804412842, "learning_rate": 0.0002, "epoch": 2.3828125, "step": 1830}, {"loss": 1.571, "grad_norm": 0.4672335386276245, "learning_rate": 0.0002, "epoch": 2.3958333333333335, "step": 1840}, {"loss": 1.6156, "grad_norm": 0.4914792776107788, "learning_rate": 0.0002, "epoch": 2.4088541666666665, "step": 1850}, {"loss": 1.5356, "grad_norm": 0.44478079676628113, "learning_rate": 0.0002, "epoch": 2.421875, "step": 1860}, {"loss": 1.7262, "grad_norm": 0.4601325988769531, "learning_rate": 0.0002, "epoch": 2.4348958333333335, "step": 1870}, {"loss": 1.555, "grad_norm": 0.44539815187454224, "learning_rate": 0.0002, "epoch": 2.4479166666666665, "step": 1880}, {"loss": 1.5877, "grad_norm": 0.4532422125339508, "learning_rate": 0.0002, "epoch": 2.4609375, "step": 1890}, {"loss": 1.5574, "grad_norm": 0.5323562622070312, "learning_rate": 0.0002, "epoch": 2.4739583333333335, "step": 1900}, {"loss": 1.7014, "grad_norm": 0.5027516484260559, "learning_rate": 0.0002, "epoch": 2.4869791666666665, "step": 1910}, {"loss": 1.5471, "grad_norm": 0.4507808983325958, "learning_rate": 0.0002, "epoch": 2.5, "step": 1920}, {"loss": 1.613, "grad_norm": 0.4996422827243805, "learning_rate": 0.0002, "epoch": 2.5130208333333335, "step": 1930}, {"loss": 1.6412, "grad_norm": 0.4964800179004669, "learning_rate": 0.0002, "epoch": 2.5260416666666665, "step": 1940}, {"loss": 1.547, "grad_norm": 0.48546481132507324, "learning_rate": 0.0002, "epoch": 2.5390625, "step": 1950}, {"loss": 1.6075, "grad_norm": 0.47357916831970215, "learning_rate": 0.0002, "epoch": 2.5520833333333335, "step": 1960}, {"loss": 1.5585, "grad_norm": 0.47136595845222473, "learning_rate": 0.0002, "epoch": 2.5651041666666665, "step": 1970}, {"loss": 1.5157, "grad_norm": 0.5185502171516418, "learning_rate": 0.0002, "epoch": 2.578125, "step": 1980}, {"loss": 1.6904, "grad_norm": 0.47995880246162415, "learning_rate": 0.0002, "epoch": 2.5911458333333335, "step": 1990}, {"loss": 1.638, "grad_norm": 0.5076674222946167, "learning_rate": 0.0002, "epoch": 2.6041666666666665, "step": 2000}, {"loss": 1.6038, "grad_norm": 0.4805421233177185, "learning_rate": 0.0002, "epoch": 2.6171875, "step": 2010}, {"loss": 1.6092, "grad_norm": 0.4406864047050476, "learning_rate": 0.0002, "epoch": 2.6302083333333335, "step": 2020}, {"loss": 1.6036, "grad_norm": 0.521388828754425, "learning_rate": 0.0002, "epoch": 2.6432291666666665, "step": 2030}, {"loss": 1.5338, "grad_norm": 0.4531918466091156, "learning_rate": 0.0002, "epoch": 2.65625, "step": 2040}, {"loss": 1.6853, "grad_norm": 0.45295774936676025, "learning_rate": 0.0002, "epoch": 2.6692708333333335, "step": 2050}, {"loss": 1.5252, "grad_norm": 0.4573723375797272, "learning_rate": 0.0002, "epoch": 2.6822916666666665, "step": 2060}, {"loss": 1.5765, "grad_norm": 0.4836064279079437, "learning_rate": 0.0002, "epoch": 2.6953125, "step": 2070}, {"loss": 1.5928, "grad_norm": 0.5040885210037231, "learning_rate": 0.0002, "epoch": 2.7083333333333335, "step": 2080}, {"loss": 1.6438, "grad_norm": 0.5153458118438721, "learning_rate": 0.0002, "epoch": 2.7213541666666665, "step": 2090}, {"loss": 1.5917, "grad_norm": 0.4415692090988159, "learning_rate": 0.0002, "epoch": 2.734375, "step": 2100}, {"loss": 1.6017, "grad_norm": 0.4862712621688843, "learning_rate": 0.0002, "epoch": 2.7473958333333335, "step": 2110}, {"loss": 1.5797, "grad_norm": 0.4845922589302063, "learning_rate": 0.0002, "epoch": 2.7604166666666665, "step": 2120}, {"loss": 1.6404, "grad_norm": 0.5153566598892212, "learning_rate": 0.0002, "epoch": 2.7734375, "step": 2130}, {"loss": 1.5609, "grad_norm": 0.4220491945743561, "learning_rate": 0.0002, "epoch": 2.7864583333333335, "step": 2140}, {"loss": 1.5404, "grad_norm": 0.523292064666748, "learning_rate": 0.0002, "epoch": 2.7994791666666665, "step": 2150}, {"loss": 1.4993, "grad_norm": 0.4567972421646118, "learning_rate": 0.0002, "epoch": 2.8125, "step": 2160}, {"loss": 1.6279, "grad_norm": 0.6252557039260864, "learning_rate": 0.0002, "epoch": 2.8255208333333335, "step": 2170}, {"loss": 1.6203, "grad_norm": 0.5231373310089111, "learning_rate": 0.0002, "epoch": 2.8385416666666665, "step": 2180}, {"loss": 1.5707, "grad_norm": 0.49243974685668945, "learning_rate": 0.0002, "epoch": 2.8515625, "step": 2190}, {"loss": 1.5923, "grad_norm": 0.521644115447998, "learning_rate": 0.0002, "epoch": 2.8645833333333335, "step": 2200}, {"loss": 1.6812, "grad_norm": 0.4624195694923401, "learning_rate": 0.0002, "epoch": 2.8776041666666665, "step": 2210}, {"loss": 1.6132, "grad_norm": 0.4463620185852051, "learning_rate": 0.0002, "epoch": 2.890625, "step": 2220}, {"loss": 1.6095, "grad_norm": 0.45793524384498596, "learning_rate": 0.0002, "epoch": 2.9036458333333335, "step": 2230}, {"loss": 1.5985, "grad_norm": 0.46979188919067383, "learning_rate": 0.0002, "epoch": 2.9166666666666665, "step": 2240}, {"loss": 1.617, "grad_norm": 0.5220303535461426, "learning_rate": 0.0002, "epoch": 2.9296875, "step": 2250}, {"loss": 1.5978, "grad_norm": 0.44405895471572876, "learning_rate": 0.0002, "epoch": 2.9427083333333335, "step": 2260}, {"loss": 1.6685, "grad_norm": 0.523841381072998, "learning_rate": 0.0002, "epoch": 2.9557291666666665, "step": 2270}, {"loss": 1.595, "grad_norm": 0.4928138852119446, "learning_rate": 0.0002, "epoch": 2.96875, "step": 2280}, {"loss": 1.606, "grad_norm": 0.4918071925640106, "learning_rate": 0.0002, "epoch": 2.9817708333333335, "step": 2290}, {"loss": 1.5736, "grad_norm": 0.4584912061691284, "learning_rate": 0.0002, "epoch": 2.9947916666666665, "step": 2300}, {"eval_loss": 1.8474308252334595, "eval_runtime": 103.7697, "eval_samples_per_second": 4.963, "eval_steps_per_second": 0.626, "epoch": 3.0, "step": 2304}, {"loss": 1.5454, "grad_norm": 0.4801871180534363, "learning_rate": 0.0002, "epoch": 3.0078125, "step": 2310}, {"loss": 1.4019, "grad_norm": 0.5789998173713684, "learning_rate": 0.0002, "epoch": 3.0208333333333335, "step": 2320}, {"loss": 1.4419, "grad_norm": 0.49856704473495483, "learning_rate": 0.0002, "epoch": 3.0338541666666665, "step": 2330}, {"loss": 1.4718, "grad_norm": 0.5625631213188171, "learning_rate": 0.0002, "epoch": 3.046875, "step": 2340}, {"loss": 1.4727, "grad_norm": 0.557637095451355, "learning_rate": 0.0002, "epoch": 3.0598958333333335, "step": 2350}, {"loss": 1.4654, "grad_norm": 0.528889536857605, "learning_rate": 0.0002, "epoch": 3.0729166666666665, "step": 2360}, {"loss": 1.4307, "grad_norm": 0.5952284932136536, "learning_rate": 0.0002, "epoch": 3.0859375, "step": 2370}, {"loss": 1.5304, "grad_norm": 0.5549899339675903, "learning_rate": 0.0002, "epoch": 3.0989583333333335, "step": 2380}, {"loss": 1.5034, "grad_norm": 0.662139892578125, "learning_rate": 0.0002, "epoch": 3.1119791666666665, "step": 2390}, {"loss": 1.4754, "grad_norm": 0.5281530618667603, "learning_rate": 0.0002, "epoch": 3.125, "step": 2400}, {"loss": 1.4047, "grad_norm": 0.6134106516838074, "learning_rate": 0.0002, "epoch": 3.1380208333333335, "step": 2410}, {"loss": 1.5001, "grad_norm": 0.6040887236595154, "learning_rate": 0.0002, "epoch": 3.1510416666666665, "step": 2420}, {"loss": 1.3936, "grad_norm": 0.549672544002533, "learning_rate": 0.0002, "epoch": 3.1640625, "step": 2430}, {"loss": 1.401, "grad_norm": 0.9195653796195984, "learning_rate": 0.0002, "epoch": 3.1770833333333335, "step": 2440}, {"loss": 1.507, "grad_norm": 0.5578703284263611, "learning_rate": 0.0002, "epoch": 3.1901041666666665, "step": 2450}, {"loss": 1.4873, "grad_norm": 0.5982925891876221, "learning_rate": 0.0002, "epoch": 3.203125, "step": 2460}, {"loss": 1.4909, "grad_norm": 0.5544393062591553, "learning_rate": 0.0002, "epoch": 3.2161458333333335, "step": 2470}, {"loss": 1.4705, "grad_norm": 0.6015266180038452, "learning_rate": 0.0002, "epoch": 3.2291666666666665, "step": 2480}, {"loss": 1.4652, "grad_norm": 0.5995243191719055, "learning_rate": 0.0002, "epoch": 3.2421875, "step": 2490}, {"loss": 1.4486, "grad_norm": 0.5846129059791565, "learning_rate": 0.0002, "epoch": 3.2552083333333335, "step": 2500}, {"loss": 1.4529, "grad_norm": 0.5552570223808289, "learning_rate": 0.0002, "epoch": 3.2682291666666665, "step": 2510}, {"loss": 1.3884, "grad_norm": 0.576998233795166, "learning_rate": 0.0002, "epoch": 3.28125, "step": 2520}, {"loss": 1.4463, "grad_norm": 0.6526138186454773, "learning_rate": 0.0002, "epoch": 3.2942708333333335, "step": 2530}, {"loss": 1.474, "grad_norm": 0.6064265966415405, "learning_rate": 0.0002, "epoch": 3.3072916666666665, "step": 2540}, {"loss": 1.5125, "grad_norm": 0.5542362928390503, "learning_rate": 0.0002, "epoch": 3.3203125, "step": 2550}, {"loss": 1.4769, "grad_norm": 0.6048482060432434, "learning_rate": 0.0002, "epoch": 3.3333333333333335, "step": 2560}, {"loss": 1.4682, "grad_norm": 0.6328344941139221, "learning_rate": 0.0002, "epoch": 3.3463541666666665, "step": 2570}, {"loss": 1.5647, "grad_norm": 0.6347311735153198, "learning_rate": 0.0002, "epoch": 3.359375, "step": 2580}, {"loss": 1.5752, "grad_norm": 0.537570595741272, "learning_rate": 0.0002, "epoch": 3.3723958333333335, "step": 2590}, {"loss": 1.4086, "grad_norm": 0.5704807639122009, "learning_rate": 0.0002, "epoch": 3.3854166666666665, "step": 2600}, {"loss": 1.5653, "grad_norm": 0.5914373993873596, "learning_rate": 0.0002, "epoch": 3.3984375, "step": 2610}, {"loss": 1.4436, "grad_norm": 0.6724640130996704, "learning_rate": 0.0002, "epoch": 3.4114583333333335, "step": 2620}, {"loss": 1.5731, "grad_norm": 0.6295472383499146, "learning_rate": 0.0002, "epoch": 3.4244791666666665, "step": 2630}, {"loss": 1.4715, "grad_norm": 0.5842770934104919, "learning_rate": 0.0002, "epoch": 3.4375, "step": 2640}, {"loss": 1.451, "grad_norm": 0.6297776699066162, "learning_rate": 0.0002, "epoch": 3.4505208333333335, "step": 2650}, {"loss": 1.5761, "grad_norm": 0.6105847358703613, "learning_rate": 0.0002, "epoch": 3.4635416666666665, "step": 2660}, {"loss": 1.5332, "grad_norm": 0.6294940710067749, "learning_rate": 0.0002, "epoch": 3.4765625, "step": 2670}, {"loss": 1.5451, "grad_norm": 0.6573333740234375, "learning_rate": 0.0002, "epoch": 3.4895833333333335, "step": 2680}, {"loss": 1.4592, "grad_norm": 0.663661539554596, "learning_rate": 0.0002, "epoch": 3.5026041666666665, "step": 2690}, {"loss": 1.5286, "grad_norm": 0.6729148626327515, "learning_rate": 0.0002, "epoch": 3.515625, "step": 2700}, {"loss": 1.534, "grad_norm": 0.6633102893829346, "learning_rate": 0.0002, "epoch": 3.5286458333333335, "step": 2710}, {"loss": 1.4023, "grad_norm": 0.567686915397644, "learning_rate": 0.0002, "epoch": 3.5416666666666665, "step": 2720}, {"loss": 1.4925, "grad_norm": 0.6281962394714355, "learning_rate": 0.0002, "epoch": 3.5546875, "step": 2730}, {"loss": 1.5028, "grad_norm": 0.5710738897323608, "learning_rate": 0.0002, "epoch": 3.5677083333333335, "step": 2740}, {"loss": 1.4393, "grad_norm": 0.648162305355072, "learning_rate": 0.0002, "epoch": 3.5807291666666665, "step": 2750}, {"loss": 1.4294, "grad_norm": 0.5466254949569702, "learning_rate": 0.0002, "epoch": 3.59375, "step": 2760}, {"loss": 1.4993, "grad_norm": 0.6867973208427429, "learning_rate": 0.0002, "epoch": 3.6067708333333335, "step": 2770}, {"loss": 1.4463, "grad_norm": 0.673612117767334, "learning_rate": 0.0002, "epoch": 3.6197916666666665, "step": 2780}, {"loss": 1.5231, "grad_norm": 0.6928417086601257, "learning_rate": 0.0002, "epoch": 3.6328125, "step": 2790}, {"loss": 1.5212, "grad_norm": 0.6603742837905884, "learning_rate": 0.0002, "epoch": 3.6458333333333335, "step": 2800}, {"loss": 1.4889, "grad_norm": 0.5964401960372925, "learning_rate": 0.0002, "epoch": 3.6588541666666665, "step": 2810}, {"loss": 1.4585, "grad_norm": 0.6224474310874939, "learning_rate": 0.0002, "epoch": 3.671875, "step": 2820}, {"loss": 1.5119, "grad_norm": 0.6592439413070679, "learning_rate": 0.0002, "epoch": 3.6848958333333335, "step": 2830}, {"loss": 1.4729, "grad_norm": 0.6255369186401367, "learning_rate": 0.0002, "epoch": 3.6979166666666665, "step": 2840}, {"loss": 1.4598, "grad_norm": 0.7136337757110596, "learning_rate": 0.0002, "epoch": 3.7109375, "step": 2850}, {"loss": 1.4491, "grad_norm": 0.6229757070541382, "learning_rate": 0.0002, "epoch": 3.7239583333333335, "step": 2860}, {"loss": 1.4175, "grad_norm": 0.696080207824707, "learning_rate": 0.0002, "epoch": 3.7369791666666665, "step": 2870}, {"loss": 1.5127, "grad_norm": 0.571873664855957, "learning_rate": 0.0002, "epoch": 3.75, "step": 2880}, {"loss": 1.4093, "grad_norm": 0.5918916463851929, "learning_rate": 0.0002, "epoch": 3.7630208333333335, "step": 2890}, {"loss": 1.399, "grad_norm": 0.616413950920105, "learning_rate": 0.0002, "epoch": 3.7760416666666665, "step": 2900}, {"loss": 1.4215, "grad_norm": 0.6267292499542236, "learning_rate": 0.0002, "epoch": 3.7890625, "step": 2910}, {"loss": 1.5095, "grad_norm": 0.6630783677101135, "learning_rate": 0.0002, "epoch": 3.8020833333333335, "step": 2920}, {"loss": 1.5323, "grad_norm": 0.6004238724708557, "learning_rate": 0.0002, "epoch": 3.8151041666666665, "step": 2930}, {"loss": 1.4953, "grad_norm": 0.6740423440933228, "learning_rate": 0.0002, "epoch": 3.828125, "step": 2940}, {"loss": 1.549, "grad_norm": 0.6397785544395447, "learning_rate": 0.0002, "epoch": 3.8411458333333335, "step": 2950}, {"loss": 1.5309, "grad_norm": 0.6063735485076904, "learning_rate": 0.0002, "epoch": 3.8541666666666665, "step": 2960}, {"loss": 1.5093, "grad_norm": 0.6462053060531616, "learning_rate": 0.0002, "epoch": 3.8671875, "step": 2970}, {"loss": 1.5237, "grad_norm": 0.7143250107765198, "learning_rate": 0.0002, "epoch": 3.8802083333333335, "step": 2980}, {"loss": 1.4419, "grad_norm": 0.6747874617576599, "learning_rate": 0.0002, "epoch": 3.8932291666666665, "step": 2990}, {"loss": 1.5389, "grad_norm": 0.622930109500885, "learning_rate": 0.0002, "epoch": 3.90625, "step": 3000}, {"loss": 1.4279, "grad_norm": 0.620193600654602, "learning_rate": 0.0002, "epoch": 3.9192708333333335, "step": 3010}, {"loss": 1.495, "grad_norm": 0.6321487426757812, "learning_rate": 0.0002, "epoch": 3.9322916666666665, "step": 3020}, {"loss": 1.4657, "grad_norm": 0.5705523490905762, "learning_rate": 0.0002, "epoch": 3.9453125, "step": 3030}, {"loss": 1.4099, "grad_norm": 0.6185072660446167, "learning_rate": 0.0002, "epoch": 3.9583333333333335, "step": 3040}, {"loss": 1.4667, "grad_norm": 0.6005704998970032, "learning_rate": 0.0002, "epoch": 3.9713541666666665, "step": 3050}, {"loss": 1.4896, "grad_norm": 0.5933769941329956, "learning_rate": 0.0002, "epoch": 3.984375, "step": 3060}, {"loss": 1.4973, "grad_norm": 0.695209801197052, "learning_rate": 0.0002, "epoch": 3.9973958333333335, "step": 3070}, {"eval_loss": 1.8955267667770386, "eval_runtime": 103.5061, "eval_samples_per_second": 4.976, "eval_steps_per_second": 0.628, "epoch": 4.0, "step": 3072}, {"loss": 1.3502, "grad_norm": 0.6706188321113586, "learning_rate": 0.0002, "epoch": 4.010416666666667, "step": 3080}, {"loss": 1.2917, "grad_norm": 0.7263980507850647, "learning_rate": 0.0002, "epoch": 4.0234375, "step": 3090}, {"loss": 1.2845, "grad_norm": 0.7767240405082703, "learning_rate": 0.0002, "epoch": 4.036458333333333, "step": 3100}, {"loss": 1.4169, "grad_norm": 0.6888399124145508, "learning_rate": 0.0002, "epoch": 4.049479166666667, "step": 3110}, {"loss": 1.2422, "grad_norm": 0.8860331773757935, "learning_rate": 0.0002, "epoch": 4.0625, "step": 3120}, {"loss": 1.2842, "grad_norm": 0.7572373151779175, "learning_rate": 0.0002, "epoch": 4.075520833333333, "step": 3130}, {"loss": 1.2747, "grad_norm": 0.8321536183357239, "learning_rate": 0.0002, "epoch": 4.088541666666667, "step": 3140}, {"loss": 1.2843, "grad_norm": 0.7042664885520935, "learning_rate": 0.0002, "epoch": 4.1015625, "step": 3150}, {"loss": 1.3326, "grad_norm": 0.8910216689109802, "learning_rate": 0.0002, "epoch": 4.114583333333333, "step": 3160}, {"loss": 1.2742, "grad_norm": 0.8333232402801514, "learning_rate": 0.0002, "epoch": 4.127604166666667, "step": 3170}, {"loss": 1.2985, "grad_norm": 0.7120883464813232, "learning_rate": 0.0002, "epoch": 4.140625, "step": 3180}, {"loss": 1.3611, "grad_norm": 0.6904631853103638, "learning_rate": 0.0002, "epoch": 4.153645833333333, "step": 3190}, {"loss": 1.2881, "grad_norm": 0.6398878693580627, "learning_rate": 0.0002, "epoch": 4.166666666666667, "step": 3200}, {"loss": 1.3323, "grad_norm": 0.7573692798614502, "learning_rate": 0.0002, "epoch": 4.1796875, "step": 3210}, {"loss": 1.3509, "grad_norm": 0.7850743532180786, "learning_rate": 0.0002, "epoch": 4.192708333333333, "step": 3220}, {"loss": 1.3176, "grad_norm": 0.7863165736198425, "learning_rate": 0.0002, "epoch": 4.205729166666667, "step": 3230}, {"loss": 1.3739, "grad_norm": 0.7855865359306335, "learning_rate": 0.0002, "epoch": 4.21875, "step": 3240}, {"loss": 1.3251, "grad_norm": 0.6840922832489014, "learning_rate": 0.0002, "epoch": 4.231770833333333, "step": 3250}, {"loss": 1.32, "grad_norm": 0.8499747514724731, "learning_rate": 0.0002, "epoch": 4.244791666666667, "step": 3260}, {"loss": 1.4045, "grad_norm": 0.7982883453369141, "learning_rate": 0.0002, "epoch": 4.2578125, "step": 3270}, {"loss": 1.3922, "grad_norm": 0.7776934504508972, "learning_rate": 0.0002, "epoch": 4.270833333333333, "step": 3280}, {"loss": 1.309, "grad_norm": 0.8887693881988525, "learning_rate": 0.0002, "epoch": 4.283854166666667, "step": 3290}, {"loss": 1.3213, "grad_norm": 1.0184714794158936, "learning_rate": 0.0002, "epoch": 4.296875, "step": 3300}, {"loss": 1.3212, "grad_norm": 0.7539387345314026, "learning_rate": 0.0002, "epoch": 4.309895833333333, "step": 3310}, {"loss": 1.3403, "grad_norm": 0.8137491345405579, "learning_rate": 0.0002, "epoch": 4.322916666666667, "step": 3320}, {"loss": 1.3069, "grad_norm": 0.8136276006698608, "learning_rate": 0.0002, "epoch": 4.3359375, "step": 3330}, {"loss": 1.3512, "grad_norm": 0.7880964279174805, "learning_rate": 0.0002, "epoch": 4.348958333333333, "step": 3340}, {"loss": 1.3468, "grad_norm": 0.8654456734657288, "learning_rate": 0.0002, "epoch": 4.361979166666667, "step": 3350}, {"loss": 1.3036, "grad_norm": 0.8093366622924805, "learning_rate": 0.0002, "epoch": 4.375, "step": 3360}, {"loss": 1.3826, "grad_norm": 0.8738575577735901, "learning_rate": 0.0002, "epoch": 4.388020833333333, "step": 3370}, {"loss": 1.3485, "grad_norm": 0.8923026919364929, "learning_rate": 0.0002, "epoch": 4.401041666666667, "step": 3380}, {"loss": 1.3628, "grad_norm": 0.8508910536766052, "learning_rate": 0.0002, "epoch": 4.4140625, "step": 3390}, {"loss": 1.3048, "grad_norm": 0.8262084722518921, "learning_rate": 0.0002, "epoch": 4.427083333333333, "step": 3400}, {"loss": 1.3145, "grad_norm": 0.7843561768531799, "learning_rate": 0.0002, "epoch": 4.440104166666667, "step": 3410}, {"loss": 1.4526, "grad_norm": 0.9087795615196228, "learning_rate": 0.0002, "epoch": 4.453125, "step": 3420}, {"loss": 1.3492, "grad_norm": 0.8278809189796448, "learning_rate": 0.0002, "epoch": 4.466145833333333, "step": 3430}, {"loss": 1.3797, "grad_norm": 0.8337010741233826, "learning_rate": 0.0002, "epoch": 4.479166666666667, "step": 3440}, {"loss": 1.3199, "grad_norm": 0.7790088057518005, "learning_rate": 0.0002, "epoch": 4.4921875, "step": 3450}, {"loss": 1.3344, "grad_norm": 0.826231837272644, "learning_rate": 0.0002, "epoch": 4.505208333333333, "step": 3460}, {"loss": 1.3915, "grad_norm": 0.761461079120636, "learning_rate": 0.0002, "epoch": 4.518229166666667, "step": 3470}, {"loss": 1.2829, "grad_norm": 0.8892785906791687, "learning_rate": 0.0002, "epoch": 4.53125, "step": 3480}, {"loss": 1.3571, "grad_norm": 0.6087225675582886, "learning_rate": 0.0002, "epoch": 4.544270833333333, "step": 3490}, {"loss": 1.3167, "grad_norm": 0.8259274363517761, "learning_rate": 0.0002, "epoch": 4.557291666666667, "step": 3500}, {"loss": 1.3664, "grad_norm": 0.821164071559906, "learning_rate": 0.0002, "epoch": 4.5703125, "step": 3510}, {"loss": 1.2853, "grad_norm": 0.7262887954711914, "learning_rate": 0.0002, "epoch": 4.583333333333333, "step": 3520}, {"loss": 1.3777, "grad_norm": 0.8564826250076294, "learning_rate": 0.0002, "epoch": 4.596354166666667, "step": 3530}, {"loss": 1.3238, "grad_norm": 0.8072929978370667, "learning_rate": 0.0002, "epoch": 4.609375, "step": 3540}, {"loss": 1.43, "grad_norm": 0.8040832877159119, "learning_rate": 0.0002, "epoch": 4.622395833333333, "step": 3550}, {"loss": 1.2863, "grad_norm": 0.7268754839897156, "learning_rate": 0.0002, "epoch": 4.635416666666667, "step": 3560}, {"loss": 1.3485, "grad_norm": 0.9985134601593018, "learning_rate": 0.0002, "epoch": 4.6484375, "step": 3570}, {"loss": 1.3221, "grad_norm": 0.9826098680496216, "learning_rate": 0.0002, "epoch": 4.661458333333333, "step": 3580}, {"loss": 1.2878, "grad_norm": 0.8794422149658203, "learning_rate": 0.0002, "epoch": 4.674479166666667, "step": 3590}, {"loss": 1.3674, "grad_norm": 0.7207489609718323, "learning_rate": 0.0002, "epoch": 4.6875, "step": 3600}, {"loss": 1.3192, "grad_norm": 0.7546059489250183, "learning_rate": 0.0002, "epoch": 4.700520833333333, "step": 3610}, {"loss": 1.3445, "grad_norm": 0.8318526148796082, "learning_rate": 0.0002, "epoch": 4.713541666666667, "step": 3620}, {"loss": 1.3847, "grad_norm": 0.7529309391975403, "learning_rate": 0.0002, "epoch": 4.7265625, "step": 3630}, {"loss": 1.4208, "grad_norm": 0.7762532234191895, "learning_rate": 0.0002, "epoch": 4.739583333333333, "step": 3640}, {"loss": 1.4162, "grad_norm": 0.9306083917617798, "learning_rate": 0.0002, "epoch": 4.752604166666667, "step": 3650}, {"loss": 1.3828, "grad_norm": 0.8050256967544556, "learning_rate": 0.0002, "epoch": 4.765625, "step": 3660}, {"loss": 1.3671, "grad_norm": 0.8114449381828308, "learning_rate": 0.0002, "epoch": 4.778645833333333, "step": 3670}, {"loss": 1.3296, "grad_norm": 0.8125811815261841, "learning_rate": 0.0002, "epoch": 4.791666666666667, "step": 3680}, {"loss": 1.3222, "grad_norm": 0.7642565369606018, "learning_rate": 0.0002, "epoch": 4.8046875, "step": 3690}, {"loss": 1.2842, "grad_norm": 0.8970131874084473, "learning_rate": 0.0002, "epoch": 4.817708333333333, "step": 3700}, {"loss": 1.3983, "grad_norm": 0.7654327154159546, "learning_rate": 0.0002, "epoch": 4.830729166666667, "step": 3710}, {"loss": 1.3746, "grad_norm": 0.7605378031730652, "learning_rate": 0.0002, "epoch": 4.84375, "step": 3720}, {"loss": 1.3149, "grad_norm": 0.8340551257133484, "learning_rate": 0.0002, "epoch": 4.856770833333333, "step": 3730}, {"loss": 1.4309, "grad_norm": 0.7273691296577454, "learning_rate": 0.0002, "epoch": 4.869791666666667, "step": 3740}, {"loss": 1.3094, "grad_norm": 0.9718272686004639, "learning_rate": 0.0002, "epoch": 4.8828125, "step": 3750}, {"loss": 1.296, "grad_norm": 0.7891847491264343, "learning_rate": 0.0002, "epoch": 4.895833333333333, "step": 3760}, {"loss": 1.4613, "grad_norm": 0.9090818166732788, "learning_rate": 0.0002, "epoch": 4.908854166666667, "step": 3770}, {"loss": 1.3478, "grad_norm": 0.7963318824768066, "learning_rate": 0.0002, "epoch": 4.921875, "step": 3780}, {"loss": 1.3558, "grad_norm": 0.7588343620300293, "learning_rate": 0.0002, "epoch": 4.934895833333333, "step": 3790}, {"loss": 1.3664, "grad_norm": 0.84076327085495, "learning_rate": 0.0002, "epoch": 4.947916666666667, "step": 3800}, {"loss": 1.2836, "grad_norm": 0.7767227292060852, "learning_rate": 0.0002, "epoch": 4.9609375, "step": 3810}, {"loss": 1.3925, "grad_norm": 0.8101866245269775, "learning_rate": 0.0002, "epoch": 4.973958333333333, "step": 3820}, {"loss": 1.3881, "grad_norm": 0.7808696627616882, "learning_rate": 0.0002, "epoch": 4.986979166666667, "step": 3830}, {"loss": 1.4475, "grad_norm": 0.9609483480453491, "learning_rate": 0.0002, "epoch": 5.0, "step": 3840}, {"eval_loss": 1.9610719680786133, "eval_runtime": 87.6572, "eval_samples_per_second": 5.875, "eval_steps_per_second": 0.742, "epoch": 5.0, "step": 3840}, {"loss": 1.1603, "grad_norm": 0.9366803765296936, "learning_rate": 0.0002, "epoch": 5.013020833333333, "step": 3850}, {"loss": 1.1931, "grad_norm": 0.8014302849769592, "learning_rate": 0.0002, "epoch": 5.026041666666667, "step": 3860}, {"loss": 1.1418, "grad_norm": 0.977936863899231, "learning_rate": 0.0002, "epoch": 5.0390625, "step": 3870}, {"loss": 1.1258, "grad_norm": 1.045047640800476, "learning_rate": 0.0002, "epoch": 5.052083333333333, "step": 3880}, {"loss": 1.1709, "grad_norm": 1.125620722770691, "learning_rate": 0.0002, "epoch": 5.065104166666667, "step": 3890}, {"loss": 1.1954, "grad_norm": 1.1565124988555908, "learning_rate": 0.0002, "epoch": 5.078125, "step": 3900}, {"loss": 1.1753, "grad_norm": 1.102354884147644, "learning_rate": 0.0002, "epoch": 5.091145833333333, "step": 3910}, {"loss": 1.1632, "grad_norm": 0.9567629098892212, "learning_rate": 0.0002, "epoch": 5.104166666666667, "step": 3920}, {"loss": 1.1875, "grad_norm": 0.9760252833366394, "learning_rate": 0.0002, "epoch": 5.1171875, "step": 3930}, {"loss": 1.2289, "grad_norm": 1.026168704032898, "learning_rate": 0.0002, "epoch": 5.130208333333333, "step": 3940}, {"loss": 1.1598, "grad_norm": 1.1490436792373657, "learning_rate": 0.0002, "epoch": 5.143229166666667, "step": 3950}, {"loss": 1.0823, "grad_norm": 0.9712087512016296, "learning_rate": 0.0002, "epoch": 5.15625, "step": 3960}, {"loss": 1.1948, "grad_norm": 1.0095003843307495, "learning_rate": 0.0002, "epoch": 5.169270833333333, "step": 3970}, {"loss": 1.1617, "grad_norm": 0.9171855449676514, "learning_rate": 0.0002, "epoch": 5.182291666666667, "step": 3980}, {"loss": 1.161, "grad_norm": 1.0105657577514648, "learning_rate": 0.0002, "epoch": 5.1953125, "step": 3990}, {"loss": 1.2098, "grad_norm": 1.0330145359039307, "learning_rate": 0.0002, "epoch": 5.208333333333333, "step": 4000}, {"loss": 1.1965, "grad_norm": 1.0676906108856201, "learning_rate": 0.0002, "epoch": 5.221354166666667, "step": 4010}, {"loss": 1.1392, "grad_norm": 1.055088758468628, "learning_rate": 0.0002, "epoch": 5.234375, "step": 4020}, {"loss": 1.2173, "grad_norm": 0.9523683786392212, "learning_rate": 0.0002, "epoch": 5.247395833333333, "step": 4030}, {"loss": 1.1167, "grad_norm": 0.9013799428939819, "learning_rate": 0.0002, "epoch": 5.260416666666667, "step": 4040}, {"loss": 1.2274, "grad_norm": 0.9379037618637085, "learning_rate": 0.0002, "epoch": 5.2734375, "step": 4050}, {"loss": 1.1246, "grad_norm": 0.9565327763557434, "learning_rate": 0.0002, "epoch": 5.286458333333333, "step": 4060}, {"loss": 1.2103, "grad_norm": 1.1994404792785645, "learning_rate": 0.0002, "epoch": 5.299479166666667, "step": 4070}, {"loss": 1.2016, "grad_norm": 1.0563262701034546, "learning_rate": 0.0002, "epoch": 5.3125, "step": 4080}, {"loss": 1.2478, "grad_norm": 1.024290680885315, "learning_rate": 0.0002, "epoch": 5.325520833333333, "step": 4090}, {"loss": 1.2388, "grad_norm": 1.0022907257080078, "learning_rate": 0.0002, "epoch": 5.338541666666667, "step": 4100}, {"loss": 1.1948, "grad_norm": 0.9642180800437927, "learning_rate": 0.0002, "epoch": 5.3515625, "step": 4110}, {"loss": 1.231, "grad_norm": 1.0228009223937988, "learning_rate": 0.0002, "epoch": 5.364583333333333, "step": 4120}, {"loss": 1.2341, "grad_norm": 1.0379719734191895, "learning_rate": 0.0002, "epoch": 5.377604166666667, "step": 4130}, {"loss": 1.24, "grad_norm": 1.147053599357605, "learning_rate": 0.0002, "epoch": 5.390625, "step": 4140}, {"loss": 1.2026, "grad_norm": 1.2097876071929932, "learning_rate": 0.0002, "epoch": 5.403645833333333, "step": 4150}, {"loss": 1.1978, "grad_norm": 1.0852497816085815, "learning_rate": 0.0002, "epoch": 5.416666666666667, "step": 4160}, {"loss": 1.2182, "grad_norm": 0.9765135645866394, "learning_rate": 0.0002, "epoch": 5.4296875, "step": 4170}, {"loss": 1.3117, "grad_norm": 1.0180606842041016, "learning_rate": 0.0002, "epoch": 5.442708333333333, "step": 4180}, {"loss": 1.2355, "grad_norm": 1.185409665107727, "learning_rate": 0.0002, "epoch": 5.455729166666667, "step": 4190}, {"loss": 1.1531, "grad_norm": 0.9363358020782471, "learning_rate": 0.0002, "epoch": 5.46875, "step": 4200}, {"loss": 1.1645, "grad_norm": 1.0761215686798096, "learning_rate": 0.0002, "epoch": 5.481770833333333, "step": 4210}, {"loss": 1.1465, "grad_norm": 1.057626724243164, "learning_rate": 0.0002, "epoch": 5.494791666666667, "step": 4220}, {"loss": 1.2051, "grad_norm": 1.0103157758712769, "learning_rate": 0.0002, "epoch": 5.5078125, "step": 4230}, {"loss": 1.2193, "grad_norm": 1.1056627035140991, "learning_rate": 0.0002, "epoch": 5.520833333333333, "step": 4240}, {"loss": 1.1941, "grad_norm": 1.0256257057189941, "learning_rate": 0.0002, "epoch": 5.533854166666667, "step": 4250}, {"loss": 1.1724, "grad_norm": 1.2814106941223145, "learning_rate": 0.0002, "epoch": 5.546875, "step": 4260}, {"loss": 1.1676, "grad_norm": 0.9044927954673767, "learning_rate": 0.0002, "epoch": 5.559895833333333, "step": 4270}, {"loss": 1.2448, "grad_norm": 0.9870165586471558, "learning_rate": 0.0002, "epoch": 5.572916666666667, "step": 4280}, {"loss": 1.2414, "grad_norm": 0.9867369532585144, "learning_rate": 0.0002, "epoch": 5.5859375, "step": 4290}, {"loss": 1.2115, "grad_norm": 1.045625925064087, "learning_rate": 0.0002, "epoch": 5.598958333333333, "step": 4300}, {"loss": 1.2786, "grad_norm": 0.979853630065918, "learning_rate": 0.0002, "epoch": 5.611979166666667, "step": 4310}, {"loss": 1.1629, "grad_norm": 1.029212236404419, "learning_rate": 0.0002, "epoch": 5.625, "step": 4320}, {"loss": 1.1985, "grad_norm": 1.0348633527755737, "learning_rate": 0.0002, "epoch": 5.638020833333333, "step": 4330}, {"loss": 1.1914, "grad_norm": 1.0055185556411743, "learning_rate": 0.0002, "epoch": 5.651041666666667, "step": 4340}, {"loss": 1.2658, "grad_norm": 0.9312447309494019, "learning_rate": 0.0002, "epoch": 5.6640625, "step": 4350}, {"loss": 1.1901, "grad_norm": 1.1411694288253784, "learning_rate": 0.0002, "epoch": 5.677083333333333, "step": 4360}, {"loss": 1.2679, "grad_norm": 0.9764434695243835, "learning_rate": 0.0002, "epoch": 5.690104166666667, "step": 4370}, {"loss": 1.2215, "grad_norm": 1.079154133796692, "learning_rate": 0.0002, "epoch": 5.703125, "step": 4380}, {"loss": 1.1659, "grad_norm": 0.999526858329773, "learning_rate": 0.0002, "epoch": 5.716145833333333, "step": 4390}, {"loss": 1.1685, "grad_norm": 1.1239734888076782, "learning_rate": 0.0002, "epoch": 5.729166666666667, "step": 4400}, {"loss": 1.1126, "grad_norm": 1.0539512634277344, "learning_rate": 0.0002, "epoch": 5.7421875, "step": 4410}, {"loss": 1.1413, "grad_norm": 0.9884052872657776, "learning_rate": 0.0002, "epoch": 5.755208333333333, "step": 4420}, {"loss": 1.1781, "grad_norm": 0.9821958541870117, "learning_rate": 0.0002, "epoch": 5.768229166666667, "step": 4430}, {"loss": 1.2319, "grad_norm": 0.9340839982032776, "learning_rate": 0.0002, "epoch": 5.78125, "step": 4440}, {"loss": 1.3085, "grad_norm": 0.9935781955718994, "learning_rate": 0.0002, "epoch": 5.794270833333333, "step": 4450}, {"loss": 1.1726, "grad_norm": 1.1027121543884277, "learning_rate": 0.0002, "epoch": 5.807291666666667, "step": 4460}, {"loss": 1.2385, "grad_norm": 0.9388337135314941, "learning_rate": 0.0002, "epoch": 5.8203125, "step": 4470}, {"loss": 1.259, "grad_norm": 1.0957310199737549, "learning_rate": 0.0002, "epoch": 5.833333333333333, "step": 4480}, {"loss": 1.3017, "grad_norm": 1.0832754373550415, "learning_rate": 0.0002, "epoch": 5.846354166666667, "step": 4490}, {"loss": 1.1724, "grad_norm": 0.9498379826545715, "learning_rate": 0.0002, "epoch": 5.859375, "step": 4500}, {"loss": 1.2312, "grad_norm": 0.9104725122451782, "learning_rate": 0.0002, "epoch": 5.872395833333333, "step": 4510}, {"loss": 1.204, "grad_norm": 1.2238177061080933, "learning_rate": 0.0002, "epoch": 5.885416666666667, "step": 4520}, {"loss": 1.2163, "grad_norm": 1.0549527406692505, "learning_rate": 0.0002, "epoch": 5.8984375, "step": 4530}, {"loss": 1.3086, "grad_norm": 1.0415066480636597, "learning_rate": 0.0002, "epoch": 5.911458333333333, "step": 4540}, {"loss": 1.1744, "grad_norm": 0.9098646640777588, "learning_rate": 0.0002, "epoch": 5.924479166666667, "step": 4550}, {"loss": 1.2126, "grad_norm": 0.9182857275009155, "learning_rate": 0.0002, "epoch": 5.9375, "step": 4560}, {"loss": 1.2341, "grad_norm": 1.088038444519043, "learning_rate": 0.0002, "epoch": 5.950520833333333, "step": 4570}, {"loss": 1.2317, "grad_norm": 1.1331020593643188, "learning_rate": 0.0002, "epoch": 5.963541666666667, "step": 4580}, {"loss": 1.2318, "grad_norm": 0.9592235088348389, "learning_rate": 0.0002, "epoch": 5.9765625, "step": 4590}, {"loss": 1.1995, "grad_norm": 1.0126368999481201, "learning_rate": 0.0002, "epoch": 5.989583333333333, "step": 4600}, {"eval_loss": 2.096651315689087, "eval_runtime": 43.1936, "eval_samples_per_second": 11.923, "eval_steps_per_second": 1.505, "epoch": 6.0, "step": 4608}, {"loss": 1.2061, "grad_norm": 1.0549334287643433, "learning_rate": 0.0002, "epoch": 6.002604166666667, "step": 4610}, {"loss": 1.0046, "grad_norm": 1.099247694015503, "learning_rate": 0.0002, "epoch": 6.015625, "step": 4620}, {"loss": 1.0542, "grad_norm": 1.0992592573165894, "learning_rate": 0.0002, "epoch": 6.028645833333333, "step": 4630}, {"loss": 1.0032, "grad_norm": 1.139350414276123, "learning_rate": 0.0002, "epoch": 6.041666666666667, "step": 4640}, {"loss": 1.0105, "grad_norm": 1.1316219568252563, "learning_rate": 0.0002, "epoch": 6.0546875, "step": 4650}, {"loss": 1.05, "grad_norm": 1.5254799127578735, "learning_rate": 0.0002, "epoch": 6.067708333333333, "step": 4660}, {"loss": 1.0357, "grad_norm": 1.155513048171997, "learning_rate": 0.0002, "epoch": 6.080729166666667, "step": 4670}, {"loss": 1.0782, "grad_norm": 1.311339259147644, "learning_rate": 0.0002, "epoch": 6.09375, "step": 4680}, {"loss": 1.098, "grad_norm": 0.9942600131034851, "learning_rate": 0.0002, "epoch": 6.106770833333333, "step": 4690}, {"loss": 0.9989, "grad_norm": 1.388214111328125, "learning_rate": 0.0002, "epoch": 6.119791666666667, "step": 4700}, {"loss": 1.0893, "grad_norm": 1.260488510131836, "learning_rate": 0.0002, "epoch": 6.1328125, "step": 4710}, {"loss": 1.0225, "grad_norm": 1.231615662574768, "learning_rate": 0.0002, "epoch": 6.145833333333333, "step": 4720}, {"loss": 1.0547, "grad_norm": 1.049696922302246, "learning_rate": 0.0002, "epoch": 6.158854166666667, "step": 4730}, {"loss": 1.0089, "grad_norm": 1.145426869392395, "learning_rate": 0.0002, "epoch": 6.171875, "step": 4740}, {"loss": 1.0751, "grad_norm": 1.1715868711471558, "learning_rate": 0.0002, "epoch": 6.184895833333333, "step": 4750}, {"loss": 0.9901, "grad_norm": 1.2575212717056274, "learning_rate": 0.0002, "epoch": 6.197916666666667, "step": 4760}, {"loss": 0.9775, "grad_norm": 1.2996530532836914, "learning_rate": 0.0002, "epoch": 6.2109375, "step": 4770}, {"loss": 1.0227, "grad_norm": 1.4030718803405762, "learning_rate": 0.0002, "epoch": 6.223958333333333, "step": 4780}, {"loss": 1.0439, "grad_norm": 1.2140913009643555, "learning_rate": 0.0002, "epoch": 6.236979166666667, "step": 4790}, {"loss": 1.0637, "grad_norm": 1.3512893915176392, "learning_rate": 0.0002, "epoch": 6.25, "step": 4800}, {"loss": 1.0367, "grad_norm": 1.1931439638137817, "learning_rate": 0.0002, "epoch": 6.263020833333333, "step": 4810}, {"loss": 1.0615, "grad_norm": 1.0379345417022705, "learning_rate": 0.0002, "epoch": 6.276041666666667, "step": 4820}, {"loss": 1.0954, "grad_norm": 1.1571568250656128, "learning_rate": 0.0002, "epoch": 6.2890625, "step": 4830}, {"loss": 1.0029, "grad_norm": 1.0717264413833618, "learning_rate": 0.0002, "epoch": 6.302083333333333, "step": 4840}, {"loss": 1.0466, "grad_norm": 1.360496997833252, "learning_rate": 0.0002, "epoch": 6.315104166666667, "step": 4850}, {"loss": 1.001, "grad_norm": 1.0864052772521973, "learning_rate": 0.0002, "epoch": 6.328125, "step": 4860}, {"loss": 1.0229, "grad_norm": 1.3391871452331543, "learning_rate": 0.0002, "epoch": 6.341145833333333, "step": 4870}, {"loss": 1.0797, "grad_norm": 1.2568541765213013, "learning_rate": 0.0002, "epoch": 6.354166666666667, "step": 4880}, {"loss": 1.1076, "grad_norm": 1.255483627319336, "learning_rate": 0.0002, "epoch": 6.3671875, "step": 4890}, {"loss": 1.0244, "grad_norm": 1.173972487449646, "learning_rate": 0.0002, "epoch": 6.380208333333333, "step": 4900}, {"loss": 1.0238, "grad_norm": 1.14010488986969, "learning_rate": 0.0002, "epoch": 6.393229166666667, "step": 4910}, {"loss": 1.0319, "grad_norm": 1.1317493915557861, "learning_rate": 0.0002, "epoch": 6.40625, "step": 4920}, {"loss": 1.0195, "grad_norm": 1.1547486782073975, "learning_rate": 0.0002, "epoch": 6.419270833333333, "step": 4930}, {"loss": 1.0456, "grad_norm": 1.1822998523712158, "learning_rate": 0.0002, "epoch": 6.432291666666667, "step": 4940}, {"loss": 1.0535, "grad_norm": 1.1865756511688232, "learning_rate": 0.0002, "epoch": 6.4453125, "step": 4950}, {"loss": 1.0255, "grad_norm": 1.13661789894104, "learning_rate": 0.0002, "epoch": 6.458333333333333, "step": 4960}, {"loss": 1.0771, "grad_norm": 1.047326683998108, "learning_rate": 0.0002, "epoch": 6.471354166666667, "step": 4970}, {"loss": 1.0965, "grad_norm": 1.3550827503204346, "learning_rate": 0.0002, "epoch": 6.484375, "step": 4980}, {"loss": 1.0984, "grad_norm": 1.2868435382843018, "learning_rate": 0.0002, "epoch": 6.497395833333333, "step": 4990}, {"loss": 1.1046, "grad_norm": 1.4678666591644287, "learning_rate": 0.0002, "epoch": 6.510416666666667, "step": 5000}, {"loss": 1.076, "grad_norm": 1.3739159107208252, "learning_rate": 0.0002, "epoch": 6.5234375, "step": 5010}, {"loss": 1.046, "grad_norm": 1.213034987449646, "learning_rate": 0.0002, "epoch": 6.536458333333333, "step": 5020}, {"loss": 1.1129, "grad_norm": 1.5025049448013306, "learning_rate": 0.0002, "epoch": 6.549479166666667, "step": 5030}, {"loss": 1.0564, "grad_norm": 1.1811821460723877, "learning_rate": 0.0002, "epoch": 6.5625, "step": 5040}, {"loss": 1.1096, "grad_norm": 1.2845960855484009, "learning_rate": 0.0002, "epoch": 6.575520833333333, "step": 5050}, {"loss": 1.0274, "grad_norm": 1.0641103982925415, "learning_rate": 0.0002, "epoch": 6.588541666666667, "step": 5060}, {"loss": 1.0559, "grad_norm": 1.0967134237289429, "learning_rate": 0.0002, "epoch": 6.6015625, "step": 5070}, {"loss": 1.0965, "grad_norm": 1.1802116632461548, "learning_rate": 0.0002, "epoch": 6.614583333333333, "step": 5080}, {"loss": 1.0296, "grad_norm": 1.3110308647155762, "learning_rate": 0.0002, "epoch": 6.627604166666667, "step": 5090}, {"loss": 1.0273, "grad_norm": 1.1863301992416382, "learning_rate": 0.0002, "epoch": 6.640625, "step": 5100}, {"loss": 1.1355, "grad_norm": 1.0931109189987183, "learning_rate": 0.0002, "epoch": 6.653645833333333, "step": 5110}, {"loss": 1.1025, "grad_norm": 1.0571614503860474, "learning_rate": 0.0002, "epoch": 6.666666666666667, "step": 5120}, {"loss": 1.1292, "grad_norm": 1.2855656147003174, "learning_rate": 0.0002, "epoch": 6.6796875, "step": 5130}, {"loss": 1.0582, "grad_norm": 1.2217806577682495, "learning_rate": 0.0002, "epoch": 6.692708333333333, "step": 5140}, {"loss": 1.1098, "grad_norm": 1.093658447265625, "learning_rate": 0.0002, "epoch": 6.705729166666667, "step": 5150}, {"loss": 1.0845, "grad_norm": 1.2592076063156128, "learning_rate": 0.0002, "epoch": 6.71875, "step": 5160}, {"loss": 1.0381, "grad_norm": 1.0720105171203613, "learning_rate": 0.0002, "epoch": 6.731770833333333, "step": 5170}, {"loss": 1.0707, "grad_norm": 1.178058385848999, "learning_rate": 0.0002, "epoch": 6.744791666666667, "step": 5180}, {"loss": 1.116, "grad_norm": 1.1897447109222412, "learning_rate": 0.0002, "epoch": 6.7578125, "step": 5190}, {"loss": 1.064, "grad_norm": 1.3547686338424683, "learning_rate": 0.0002, "epoch": 6.770833333333333, "step": 5200}, {"loss": 1.0642, "grad_norm": 1.2514727115631104, "learning_rate": 0.0002, "epoch": 6.783854166666667, "step": 5210}, {"loss": 1.0898, "grad_norm": 1.5253846645355225, "learning_rate": 0.0002, "epoch": 6.796875, "step": 5220}, {"loss": 1.0426, "grad_norm": 1.090774655342102, "learning_rate": 0.0002, "epoch": 6.809895833333333, "step": 5230}, {"loss": 1.0867, "grad_norm": 1.1387991905212402, "learning_rate": 0.0002, "epoch": 6.822916666666667, "step": 5240}, {"loss": 1.0493, "grad_norm": 1.102423906326294, "learning_rate": 0.0002, "epoch": 6.8359375, "step": 5250}, {"loss": 1.0976, "grad_norm": 1.2453415393829346, "learning_rate": 0.0002, "epoch": 6.848958333333333, "step": 5260}, {"loss": 1.1046, "grad_norm": 1.2541141510009766, "learning_rate": 0.0002, "epoch": 6.861979166666667, "step": 5270}, {"loss": 1.0816, "grad_norm": 1.2719744443893433, "learning_rate": 0.0002, "epoch": 6.875, "step": 5280}, {"loss": 1.0399, "grad_norm": 1.085763931274414, "learning_rate": 0.0002, "epoch": 6.888020833333333, "step": 5290}, {"loss": 1.1306, "grad_norm": 1.2399879693984985, "learning_rate": 0.0002, "epoch": 6.901041666666667, "step": 5300}, {"loss": 1.1178, "grad_norm": 1.244888424873352, "learning_rate": 0.0002, "epoch": 6.9140625, "step": 5310}, {"loss": 1.0868, "grad_norm": 1.1424126625061035, "learning_rate": 0.0002, "epoch": 6.927083333333333, "step": 5320}, {"loss": 1.0768, "grad_norm": 1.1804956197738647, "learning_rate": 0.0002, "epoch": 6.940104166666667, "step": 5330}, {"loss": 1.0803, "grad_norm": 1.3943406343460083, "learning_rate": 0.0002, "epoch": 6.953125, "step": 5340}, {"loss": 1.0573, "grad_norm": 1.3278584480285645, "learning_rate": 0.0002, "epoch": 6.966145833333333, "step": 5350}, {"loss": 1.1008, "grad_norm": 1.3579362630844116, "learning_rate": 0.0002, "epoch": 6.979166666666667, "step": 5360}, {"loss": 1.059, "grad_norm": 1.2172175645828247, "learning_rate": 0.0002, "epoch": 6.9921875, "step": 5370}, {"eval_loss": 2.200756549835205, "eval_runtime": 42.8258, "eval_samples_per_second": 12.025, "eval_steps_per_second": 1.518, "epoch": 7.0, "step": 5376}, {"loss": 1.0272, "grad_norm": 1.175237774848938, "learning_rate": 0.0002, "epoch": 7.005208333333333, "step": 5380}, {"loss": 0.8889, "grad_norm": 1.3215409517288208, "learning_rate": 0.0002, "epoch": 7.018229166666667, "step": 5390}, {"loss": 0.8849, "grad_norm": 1.5751091241836548, "learning_rate": 0.0002, "epoch": 7.03125, "step": 5400}, {"loss": 0.8786, "grad_norm": 1.390234351158142, "learning_rate": 0.0002, "epoch": 7.044270833333333, "step": 5410}, {"loss": 0.8735, "grad_norm": 1.3558553457260132, "learning_rate": 0.0002, "epoch": 7.057291666666667, "step": 5420}, {"loss": 0.8951, "grad_norm": 1.4664019346237183, "learning_rate": 0.0002, "epoch": 7.0703125, "step": 5430}, {"loss": 0.8829, "grad_norm": 1.5194770097732544, "learning_rate": 0.0002, "epoch": 7.083333333333333, "step": 5440}, {"loss": 0.8322, "grad_norm": 1.2315709590911865, "learning_rate": 0.0002, "epoch": 7.096354166666667, "step": 5450}, {"loss": 0.847, "grad_norm": 1.4849501848220825, "learning_rate": 0.0002, "epoch": 7.109375, "step": 5460}, {"loss": 0.8467, "grad_norm": 1.471713662147522, "learning_rate": 0.0002, "epoch": 7.122395833333333, "step": 5470}, {"loss": 0.8852, "grad_norm": 1.5665255784988403, "learning_rate": 0.0002, "epoch": 7.135416666666667, "step": 5480}, {"loss": 0.8499, "grad_norm": 1.796554446220398, "learning_rate": 0.0002, "epoch": 7.1484375, "step": 5490}, {"loss": 0.9099, "grad_norm": 1.3455413579940796, "learning_rate": 0.0002, "epoch": 7.161458333333333, "step": 5500}, {"loss": 0.8884, "grad_norm": 1.5465866327285767, "learning_rate": 0.0002, "epoch": 7.174479166666667, "step": 5510}, {"loss": 0.8476, "grad_norm": 1.42877197265625, "learning_rate": 0.0002, "epoch": 7.1875, "step": 5520}, {"loss": 0.8829, "grad_norm": 1.5602610111236572, "learning_rate": 0.0002, "epoch": 7.200520833333333, "step": 5530}, {"loss": 0.8745, "grad_norm": 1.3813670873641968, "learning_rate": 0.0002, "epoch": 7.213541666666667, "step": 5540}, {"loss": 0.8852, "grad_norm": 1.2724273204803467, "learning_rate": 0.0002, "epoch": 7.2265625, "step": 5550}, {"loss": 0.8677, "grad_norm": 1.3184880018234253, "learning_rate": 0.0002, "epoch": 7.239583333333333, "step": 5560}, {"loss": 0.9141, "grad_norm": 1.3192334175109863, "learning_rate": 0.0002, "epoch": 7.252604166666667, "step": 5570}, {"loss": 0.9027, "grad_norm": 1.6324747800827026, "learning_rate": 0.0002, "epoch": 7.265625, "step": 5580}, {"loss": 0.9024, "grad_norm": 1.5259788036346436, "learning_rate": 0.0002, "epoch": 7.278645833333333, "step": 5590}, {"loss": 0.9007, "grad_norm": 1.397698163986206, "learning_rate": 0.0002, "epoch": 7.291666666666667, "step": 5600}, {"loss": 0.9571, "grad_norm": 1.5079587697982788, "learning_rate": 0.0002, "epoch": 7.3046875, "step": 5610}, {"loss": 0.9091, "grad_norm": 1.3299126625061035, "learning_rate": 0.0002, "epoch": 7.317708333333333, "step": 5620}, {"loss": 0.903, "grad_norm": 1.4264276027679443, "learning_rate": 0.0002, "epoch": 7.330729166666667, "step": 5630}, {"loss": 0.8602, "grad_norm": 1.3027597665786743, "learning_rate": 0.0002, "epoch": 7.34375, "step": 5640}, {"loss": 0.9054, "grad_norm": 1.3218268156051636, "learning_rate": 0.0002, "epoch": 7.356770833333333, "step": 5650}, {"loss": 0.986, "grad_norm": 1.346595048904419, "learning_rate": 0.0002, "epoch": 7.369791666666667, "step": 5660}, {"loss": 0.983, "grad_norm": 1.5836858749389648, "learning_rate": 0.0002, "epoch": 7.3828125, "step": 5670}, {"loss": 0.8698, "grad_norm": 1.586815595626831, "learning_rate": 0.0002, "epoch": 7.395833333333333, "step": 5680}, {"loss": 0.9524, "grad_norm": 1.8045003414154053, "learning_rate": 0.0002, "epoch": 7.408854166666667, "step": 5690}, {"loss": 0.9479, "grad_norm": 1.5418108701705933, "learning_rate": 0.0002, "epoch": 7.421875, "step": 5700}, {"loss": 0.9209, "grad_norm": 1.426693081855774, "learning_rate": 0.0002, "epoch": 7.434895833333333, "step": 5710}, {"loss": 0.8707, "grad_norm": 1.3240386247634888, "learning_rate": 0.0002, "epoch": 7.447916666666667, "step": 5720}, {"loss": 0.9101, "grad_norm": 1.26353919506073, "learning_rate": 0.0002, "epoch": 7.4609375, "step": 5730}, {"loss": 0.9342, "grad_norm": 1.5816353559494019, "learning_rate": 0.0002, "epoch": 7.473958333333333, "step": 5740}, {"loss": 0.9493, "grad_norm": 1.133431315422058, "learning_rate": 0.0002, "epoch": 7.486979166666667, "step": 5750}, {"loss": 0.8716, "grad_norm": 1.3449418544769287, "learning_rate": 0.0002, "epoch": 7.5, "step": 5760}, {"loss": 0.9176, "grad_norm": 1.288068175315857, "learning_rate": 0.0002, "epoch": 7.513020833333333, "step": 5770}, {"loss": 0.9546, "grad_norm": 1.4578267335891724, "learning_rate": 0.0002, "epoch": 7.526041666666667, "step": 5780}, {"loss": 0.9849, "grad_norm": 1.423254370689392, "learning_rate": 0.0002, "epoch": 7.5390625, "step": 5790}, {"loss": 0.9747, "grad_norm": 1.2016581296920776, "learning_rate": 0.0002, "epoch": 7.552083333333333, "step": 5800}, {"loss": 0.8995, "grad_norm": 1.7114553451538086, "learning_rate": 0.0002, "epoch": 7.565104166666667, "step": 5810}, {"loss": 0.9398, "grad_norm": 1.5403549671173096, "learning_rate": 0.0002, "epoch": 7.578125, "step": 5820}, {"loss": 0.9186, "grad_norm": 1.324479341506958, "learning_rate": 0.0002, "epoch": 7.591145833333333, "step": 5830}, {"loss": 0.9007, "grad_norm": 1.4195842742919922, "learning_rate": 0.0002, "epoch": 7.604166666666667, "step": 5840}, {"loss": 0.9223, "grad_norm": 1.2824413776397705, "learning_rate": 0.0002, "epoch": 7.6171875, "step": 5850}, {"loss": 0.9674, "grad_norm": 1.4113891124725342, "learning_rate": 0.0002, "epoch": 7.630208333333333, "step": 5860}, {"loss": 0.969, "grad_norm": 1.425513744354248, "learning_rate": 0.0002, "epoch": 7.643229166666667, "step": 5870}, {"loss": 0.9201, "grad_norm": 1.369148850440979, "learning_rate": 0.0002, "epoch": 7.65625, "step": 5880}, {"loss": 0.9857, "grad_norm": 1.2715039253234863, "learning_rate": 0.0002, "epoch": 7.669270833333333, "step": 5890}, {"loss": 0.9278, "grad_norm": 1.5072753429412842, "learning_rate": 0.0002, "epoch": 7.682291666666667, "step": 5900}, {"loss": 0.9552, "grad_norm": 1.2748578786849976, "learning_rate": 0.0002, "epoch": 7.6953125, "step": 5910}, {"loss": 0.9677, "grad_norm": 1.4645745754241943, "learning_rate": 0.0002, "epoch": 7.708333333333333, "step": 5920}, {"loss": 0.9259, "grad_norm": 1.410602331161499, "learning_rate": 0.0002, "epoch": 7.721354166666667, "step": 5930}, {"loss": 0.9688, "grad_norm": 1.4340840578079224, "learning_rate": 0.0002, "epoch": 7.734375, "step": 5940}, {"loss": 0.9063, "grad_norm": 1.4908568859100342, "learning_rate": 0.0002, "epoch": 7.747395833333333, "step": 5950}, {"loss": 1.0224, "grad_norm": 1.6938505172729492, "learning_rate": 0.0002, "epoch": 7.760416666666667, "step": 5960}, {"loss": 0.9543, "grad_norm": 1.5617954730987549, "learning_rate": 0.0002, "epoch": 7.7734375, "step": 5970}, {"loss": 0.9556, "grad_norm": 1.4071742296218872, "learning_rate": 0.0002, "epoch": 7.786458333333333, "step": 5980}, {"loss": 0.9493, "grad_norm": 1.517405390739441, "learning_rate": 0.0002, "epoch": 7.799479166666667, "step": 5990}, {"loss": 0.9541, "grad_norm": 1.4399837255477905, "learning_rate": 0.0002, "epoch": 7.8125, "step": 6000}, {"loss": 1.0497, "grad_norm": 1.4359688758850098, "learning_rate": 0.0002, "epoch": 7.825520833333333, "step": 6010}, {"loss": 0.9701, "grad_norm": 1.4709250926971436, "learning_rate": 0.0002, "epoch": 7.838541666666667, "step": 6020}, {"loss": 0.9785, "grad_norm": 1.185585379600525, "learning_rate": 0.0002, "epoch": 7.8515625, "step": 6030}, {"loss": 0.9425, "grad_norm": 1.3034945726394653, "learning_rate": 0.0002, "epoch": 7.864583333333333, "step": 6040}, {"loss": 0.9651, "grad_norm": 1.609330654144287, "learning_rate": 0.0002, "epoch": 7.877604166666667, "step": 6050}, {"loss": 0.99, "grad_norm": 1.416290521621704, "learning_rate": 0.0002, "epoch": 7.890625, "step": 6060}, {"loss": 0.9711, "grad_norm": 1.58739173412323, "learning_rate": 0.0002, "epoch": 7.903645833333333, "step": 6070}, {"loss": 0.9317, "grad_norm": 1.2414129972457886, "learning_rate": 0.0002, "epoch": 7.916666666666667, "step": 6080}, {"loss": 1.0034, "grad_norm": 1.7573872804641724, "learning_rate": 0.0002, "epoch": 7.9296875, "step": 6090}, {"loss": 0.9231, "grad_norm": 1.514016032218933, "learning_rate": 0.0002, "epoch": 7.942708333333333, "step": 6100}, {"loss": 0.9818, "grad_norm": 1.292657494544983, "learning_rate": 0.0002, "epoch": 7.955729166666667, "step": 6110}, {"loss": 0.9517, "grad_norm": 1.4027271270751953, "learning_rate": 0.0002, "epoch": 7.96875, "step": 6120}, {"loss": 0.9159, "grad_norm": 1.4184486865997314, "learning_rate": 0.0002, "epoch": 7.981770833333333, "step": 6130}, {"loss": 0.9724, "grad_norm": 1.5634310245513916, "learning_rate": 0.0002, "epoch": 7.994791666666667, "step": 6140}]}