diff --git a/.gitattributes b/.gitattributes index 4fe9f37bc1a9e6bc4bec71a976be6d6d8d4ca6aa..193928056fd95a57e3ea7dbec7e803133861d776 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1899,3 +1899,12 @@ Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_ Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-6088/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-761/tokenizer.json filter=lfs diff=lfs merge=lfs -text Meta-Llama-3-8B-Instruct_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c78123e317d30039c08d0005af0defa5794e80a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae7e84b9b9cce27f52e927924f699d86baf6a8eae0d4b8fe15079bce56f40a67 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c78123e317d30039c08d0005af0defa5794e80a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae7e84b9b9cce27f52e927924f699d86baf6a8eae0d4b8fe15079bce56f40a67 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3df5735ef2e554b840c0244a6d8a6a8d13dd85d2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8c26633dbd05b80981f92db9993dbf7ff098d30a652dfdf41dc670cdfc578a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8536f52f1a677a9d308481ba30f39a8b1643e798 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1af6baebc3f8ff0abc08c49ffd79a5654df27b19747f9f7445476bac964abe98 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..574851d73f1f423bc7fd3b74ef9bc88c905f3754 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c77dbd02d51b31cf00f6638f73d175ce25655ac869b78c1454b4a1c11a6fdae +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0118efd747711c884c415854f37a9b4b894a529a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/trainer_state.json @@ -0,0 +1,1092 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1492, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.90464192766935e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8df34da3108954e60d591342e3325b82eac31df1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca7526c58fb50187daccf0e241c391ce6df8498282bd1e010e048249602ab490 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b9bc1b34bc702b3aa0b0d0269f9c86a2438958d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:addbcba4f1964091d69b5dd4d27a4047b82e4d34db0835bbbc88f7128944d6ae +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0fcb29d1313e89c3eb2e9ab3642b31b81cac79b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342c23fb7355ac380456caa6a415a83d278f7d61aa98c24b98832cafe8de1231 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b0202aed3c8ae724d9ab979bc01afd80597a3b5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd3195accd1e0d03d2e89a933f57bc4aad032c6e04c703fc1f4518ed41fd352e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..975aaa7a08d29328cbd9013b644883b1f9263120 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/trainer_state.json @@ -0,0 +1,1618 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 2238, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0356962891504026e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2238/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2ccb1cae122a7713133130af0bb43690870daab4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd82fb00ec44f82134ab53c4abe034c4c84c74035746c0dff5b132a2d7e394c4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8ae3a25cbbed5d0bab48fbb89ebe520e536d74c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4268ae966f41b272bcf22d6deeb0beba3df3870154b03d0156731ba8d448c8a +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2d10ee571d468e31d178e2d0b7df2d4efbbd1dc0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fce74bf3d0141c1de1a087c2b94d31ff85b10514e299700848be7dbffd122f1 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..04241447ebd38efe991981927ab3b326989bbe3e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce20246cfd2876d0285804c402e683665cd97995198d146de8a02657d164840f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..11ab685d7a0195fcfce53c4b4aff6b4b138dbdcd --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/trainer_state.json @@ -0,0 +1,2151 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 2984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.40689945220947266, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2240 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.4699273705482483, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2250 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.5531830787658691, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 2260 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.5441790223121643, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 2270 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.6145012974739075, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2280 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.6997102499008179, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 2290 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.6082330942153931, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2300 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 2310 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.7200340032577515, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2320 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.721092939376831, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 2330 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.5344305038452148, + "learning_rate": 0.0002, + "loss": 1.5307, + "step": 2340 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.5533145070075989, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2350 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.5976856350898743, + "learning_rate": 0.0002, + "loss": 1.529, + "step": 2360 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.4974960386753082, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2370 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.6377840042114258, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 2380 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.5447293519973755, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2390 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.49577030539512634, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2400 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.5588275790214539, + "learning_rate": 0.0002, + "loss": 1.4768, + "step": 2410 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.6429149508476257, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 2420 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.5713154673576355, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2430 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.6348955035209656, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 2440 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.5675528645515442, + "learning_rate": 0.0002, + "loss": 1.509, + "step": 2450 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.5570188164710999, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2460 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.6029602289199829, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2470 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.523206353187561, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2480 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.5912408828735352, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2490 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.5524865984916687, + "learning_rate": 0.0002, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.60386061668396, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 2510 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.5838595628738403, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2520 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.5400974154472351, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2530 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.6150162220001221, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 2540 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.5279412269592285, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2550 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.5974063873291016, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 2560 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.661573052406311, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2570 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.577880322933197, + "learning_rate": 0.0002, + "loss": 1.5204, + "step": 2580 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.5532318949699402, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 2590 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.5764921307563782, + "learning_rate": 0.0002, + "loss": 1.4933, + "step": 2600 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.6145682334899902, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2610 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.6561126112937927, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 2620 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.5673288106918335, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2630 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.6215338706970215, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 2640 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.5512040853500366, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2650 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.49503496289253235, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 2660 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.5714912414550781, + "learning_rate": 0.0002, + "loss": 1.524, + "step": 2670 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.6883154511451721, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 2680 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.5989556908607483, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2690 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.630268394947052, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 2700 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.5819358229637146, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 2710 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.6102097034454346, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2720 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.6858501434326172, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 2730 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.6328608393669128, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 2740 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.5366981029510498, + "learning_rate": 0.0002, + "loss": 1.5211, + "step": 2750 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.7048938274383545, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 2760 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.5371938347816467, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2770 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.6142212152481079, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 2780 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.6164522171020508, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 2790 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 2800 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.6194717288017273, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2810 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.676721453666687, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2820 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.5646911263465881, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 2830 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.5874826908111572, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 2840 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.6395232677459717, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2850 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.624563992023468, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2860 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.59019935131073, + "learning_rate": 0.0002, + "loss": 1.479, + "step": 2870 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.6700479984283447, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2880 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.6131282448768616, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 2890 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.6807777881622314, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 2900 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.5297217965126038, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2910 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.5795540809631348, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2920 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.5549747347831726, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 2930 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.5895092487335205, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 2940 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.590002715587616, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2950 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.7847695350646973, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2960 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.5845848321914673, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 2970 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.5861571431159973, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2980 + }, + { + "epoch": 4.0, + "eval_loss": 1.8821998834609985, + "eval_runtime": 90.8701, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.715, + "step": 2984 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.38092838553387e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-2984/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2bd861997653a66b52a2543926913e5b3ef7b4b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b51b94b6ba38fce2702592535f9cce1eecc13d958d3c7dfd45ef19de01201e4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d88619973661a2d1396eff8270bbdd3f94a9d72 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88922759b4ab1a48d14020e635f462c741d1f83eb6e414fae6131ae30dd3ca44 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2c801a598c32f4e2ea8fae8b42855a32ced7886 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558ac63cedbb87bf2109558c8cc6101d03ce73a1e0b23208f282c8841079f287 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4076401dfe97b2d7e9dad1fb8e883505479714c8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d58ef4ba38f4f1bfd34df871f7104dead2f5243daa15b007f8c032e9de63d5d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74ca6059ea527fa1dd9ff5f0b2a1e1a4eb20c53b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/trainer_state.json @@ -0,0 +1,2684 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 3730, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.40689945220947266, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2240 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.4699273705482483, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2250 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.5531830787658691, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 2260 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.5441790223121643, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 2270 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.6145012974739075, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2280 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.6997102499008179, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 2290 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.6082330942153931, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2300 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 2310 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.7200340032577515, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2320 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.721092939376831, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 2330 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.5344305038452148, + "learning_rate": 0.0002, + "loss": 1.5307, + "step": 2340 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.5533145070075989, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2350 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.5976856350898743, + "learning_rate": 0.0002, + "loss": 1.529, + "step": 2360 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.4974960386753082, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2370 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.6377840042114258, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 2380 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.5447293519973755, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2390 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.49577030539512634, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2400 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.5588275790214539, + "learning_rate": 0.0002, + "loss": 1.4768, + "step": 2410 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.6429149508476257, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 2420 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.5713154673576355, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2430 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.6348955035209656, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 2440 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.5675528645515442, + "learning_rate": 0.0002, + "loss": 1.509, + "step": 2450 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.5570188164710999, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2460 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.6029602289199829, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2470 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.523206353187561, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2480 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.5912408828735352, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2490 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.5524865984916687, + "learning_rate": 0.0002, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.60386061668396, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 2510 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.5838595628738403, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2520 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.5400974154472351, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2530 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.6150162220001221, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 2540 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.5279412269592285, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2550 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.5974063873291016, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 2560 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.661573052406311, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2570 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.577880322933197, + "learning_rate": 0.0002, + "loss": 1.5204, + "step": 2580 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.5532318949699402, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 2590 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.5764921307563782, + "learning_rate": 0.0002, + "loss": 1.4933, + "step": 2600 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.6145682334899902, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2610 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.6561126112937927, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 2620 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.5673288106918335, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2630 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.6215338706970215, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 2640 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.5512040853500366, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2650 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.49503496289253235, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 2660 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.5714912414550781, + "learning_rate": 0.0002, + "loss": 1.524, + "step": 2670 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.6883154511451721, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 2680 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.5989556908607483, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2690 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.630268394947052, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 2700 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.5819358229637146, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 2710 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.6102097034454346, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2720 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.6858501434326172, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 2730 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.6328608393669128, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 2740 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.5366981029510498, + "learning_rate": 0.0002, + "loss": 1.5211, + "step": 2750 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.7048938274383545, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 2760 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.5371938347816467, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2770 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.6142212152481079, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 2780 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.6164522171020508, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 2790 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 2800 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.6194717288017273, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2810 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.676721453666687, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2820 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.5646911263465881, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 2830 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.5874826908111572, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 2840 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.6395232677459717, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2850 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.624563992023468, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2860 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.59019935131073, + "learning_rate": 0.0002, + "loss": 1.479, + "step": 2870 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.6700479984283447, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2880 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.6131282448768616, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 2890 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.6807777881622314, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 2900 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.5297217965126038, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2910 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.5795540809631348, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2920 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.5549747347831726, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 2930 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.5895092487335205, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 2940 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.590002715587616, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2950 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.7847695350646973, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2960 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.5845848321914673, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 2970 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.5861571431159973, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2980 + }, + { + "epoch": 4.0, + "eval_loss": 1.8821998834609985, + "eval_runtime": 90.8701, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.715, + "step": 2984 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.6209918260574341, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 2990 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 0.607226550579071, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3000 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 0.6677961349487305, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 3010 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 0.9053248763084412, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 3020 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 0.6815084218978882, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3030 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.6709407567977905, + "learning_rate": 0.0002, + "loss": 1.3, + "step": 3040 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 0.728184163570404, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 3050 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 0.817628800868988, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3060 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 0.7384206056594849, + "learning_rate": 0.0002, + "loss": 1.3496, + "step": 3070 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 0.7380280494689941, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3080 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 0.8197277188301086, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3090 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.8971617817878723, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3100 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.7409387826919556, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 3110 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 0.6948909163475037, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3120 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.7619595527648926, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3130 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 0.7657106518745422, + "learning_rate": 0.0002, + "loss": 1.3864, + "step": 3140 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 0.6919401288032532, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 3150 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.6991415023803711, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3160 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 0.7349252700805664, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3170 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 0.8838240504264832, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3180 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.7240107655525208, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3190 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.7338636517524719, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3200 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 0.7891436815261841, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3210 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 0.7407845854759216, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 3220 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 0.7635948061943054, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3230 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 0.7478461861610413, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 3240 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 0.7684298157691956, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 3250 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 1.0287525653839111, + "learning_rate": 0.0002, + "loss": 1.4233, + "step": 3260 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 0.750616192817688, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3270 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 0.7911648750305176, + "learning_rate": 0.0002, + "loss": 1.3158, + "step": 3280 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 0.9156750440597534, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 3290 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 1.0180249214172363, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 3300 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 1.0792218446731567, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 3310 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 0.8027488589286804, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 3320 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 0.8037815093994141, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3330 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 0.7907946705818176, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3340 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 0.7206302881240845, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3350 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 0.7697674632072449, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 3360 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 0.7315130829811096, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 3370 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 0.7896273136138916, + "learning_rate": 0.0002, + "loss": 1.3598, + "step": 3380 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 0.7720345258712769, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 3390 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 0.8304631114006042, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 3400 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 0.7408214211463928, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 3410 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 0.8100157976150513, + "learning_rate": 0.0002, + "loss": 1.3957, + "step": 3420 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 0.7829574942588806, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 3430 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 0.9529728889465332, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3440 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 1.0769460201263428, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 3450 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.8941947817802429, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3460 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 0.7860096096992493, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 3470 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 0.8184044361114502, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 3480 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 0.7852717638015747, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3490 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 0.750586986541748, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3500 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 0.7966068983078003, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 3510 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 0.8387030959129333, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 3520 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 0.7373180389404297, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3530 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 0.8415353894233704, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 3540 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 0.7155488133430481, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 3550 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 0.697658896446228, + "learning_rate": 0.0002, + "loss": 1.3454, + "step": 3560 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 0.8722999095916748, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3570 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 0.8106381297111511, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3580 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 0.9320500493049622, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 3590 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 0.7583016157150269, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3600 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 0.790050208568573, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3610 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 0.7481580972671509, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3620 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.8709374666213989, + "learning_rate": 0.0002, + "loss": 1.4424, + "step": 3630 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 0.7266733050346375, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 3640 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 0.7669504880905151, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3650 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 0.7855764627456665, + "learning_rate": 0.0002, + "loss": 1.3956, + "step": 3660 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 0.8145440816879272, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3670 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 0.7487278580665588, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 3680 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 0.8390981554985046, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 3690 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 0.663752555847168, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 3700 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 0.7821969985961914, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3710 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 0.9157266020774841, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 3720 + }, + { + "epoch": 5.0, + "grad_norm": 0.7683535814285278, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3730 + }, + { + "epoch": 5.0, + "eval_loss": 1.9639414548873901, + "eval_runtime": 92.0173, + "eval_samples_per_second": 5.597, + "eval_steps_per_second": 0.706, + "step": 3730 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7261604819173376e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-3730/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16d4b4ac3ff3bab066da6ac2147a634374d06dbb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55abadf17035fe017a255b12bcaaa7717392c4804ec0e240d960706ba006e9ee +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1628905d702c7865753ff82eddcc217563d8c1b5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e4693e26da252f67b63edfb932a9ea4259c7ad924b0836da10ca0d7cacb5f6e +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0acfa4634672486fcc08c21434fb6f4f0093b86 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a63c57ceebf14cd603b3747fdec4031ce8706b1d2f1dc8dfbc48fbd5b0ee7472 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ef43b6db476ecb2c25ef3e3f772c847a8b41ac1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f93b32c30844811e3917b302bd57e288f673746860a1144c95e2efc7ecd6a99 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1bc2de5d27ba27740aa761399181390cb4b7bd90 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/trainer_state.json @@ -0,0 +1,3210 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 4476, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.40689945220947266, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2240 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.4699273705482483, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2250 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.5531830787658691, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 2260 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.5441790223121643, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 2270 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.6145012974739075, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2280 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.6997102499008179, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 2290 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.6082330942153931, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2300 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 2310 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.7200340032577515, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2320 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.721092939376831, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 2330 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.5344305038452148, + "learning_rate": 0.0002, + "loss": 1.5307, + "step": 2340 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.5533145070075989, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2350 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.5976856350898743, + "learning_rate": 0.0002, + "loss": 1.529, + "step": 2360 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.4974960386753082, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2370 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.6377840042114258, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 2380 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.5447293519973755, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2390 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.49577030539512634, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2400 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.5588275790214539, + "learning_rate": 0.0002, + "loss": 1.4768, + "step": 2410 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.6429149508476257, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 2420 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.5713154673576355, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2430 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.6348955035209656, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 2440 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.5675528645515442, + "learning_rate": 0.0002, + "loss": 1.509, + "step": 2450 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.5570188164710999, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2460 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.6029602289199829, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2470 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.523206353187561, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2480 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.5912408828735352, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2490 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.5524865984916687, + "learning_rate": 0.0002, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.60386061668396, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 2510 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.5838595628738403, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2520 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.5400974154472351, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2530 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.6150162220001221, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 2540 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.5279412269592285, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2550 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.5974063873291016, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 2560 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.661573052406311, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2570 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.577880322933197, + "learning_rate": 0.0002, + "loss": 1.5204, + "step": 2580 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.5532318949699402, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 2590 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.5764921307563782, + "learning_rate": 0.0002, + "loss": 1.4933, + "step": 2600 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.6145682334899902, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2610 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.6561126112937927, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 2620 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.5673288106918335, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2630 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.6215338706970215, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 2640 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.5512040853500366, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2650 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.49503496289253235, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 2660 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.5714912414550781, + "learning_rate": 0.0002, + "loss": 1.524, + "step": 2670 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.6883154511451721, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 2680 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.5989556908607483, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2690 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.630268394947052, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 2700 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.5819358229637146, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 2710 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.6102097034454346, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2720 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.6858501434326172, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 2730 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.6328608393669128, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 2740 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.5366981029510498, + "learning_rate": 0.0002, + "loss": 1.5211, + "step": 2750 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.7048938274383545, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 2760 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.5371938347816467, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2770 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.6142212152481079, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 2780 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.6164522171020508, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 2790 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 2800 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.6194717288017273, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2810 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.676721453666687, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2820 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.5646911263465881, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 2830 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.5874826908111572, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 2840 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.6395232677459717, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2850 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.624563992023468, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2860 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.59019935131073, + "learning_rate": 0.0002, + "loss": 1.479, + "step": 2870 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.6700479984283447, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2880 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.6131282448768616, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 2890 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.6807777881622314, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 2900 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.5297217965126038, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2910 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.5795540809631348, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2920 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.5549747347831726, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 2930 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.5895092487335205, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 2940 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.590002715587616, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2950 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.7847695350646973, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2960 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.5845848321914673, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 2970 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.5861571431159973, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2980 + }, + { + "epoch": 4.0, + "eval_loss": 1.8821998834609985, + "eval_runtime": 90.8701, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.715, + "step": 2984 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.6209918260574341, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 2990 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 0.607226550579071, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3000 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 0.6677961349487305, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 3010 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 0.9053248763084412, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 3020 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 0.6815084218978882, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3030 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.6709407567977905, + "learning_rate": 0.0002, + "loss": 1.3, + "step": 3040 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 0.728184163570404, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 3050 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 0.817628800868988, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3060 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 0.7384206056594849, + "learning_rate": 0.0002, + "loss": 1.3496, + "step": 3070 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 0.7380280494689941, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3080 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 0.8197277188301086, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3090 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.8971617817878723, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3100 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.7409387826919556, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 3110 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 0.6948909163475037, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3120 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.7619595527648926, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3130 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 0.7657106518745422, + "learning_rate": 0.0002, + "loss": 1.3864, + "step": 3140 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 0.6919401288032532, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 3150 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.6991415023803711, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3160 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 0.7349252700805664, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3170 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 0.8838240504264832, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3180 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.7240107655525208, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3190 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.7338636517524719, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3200 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 0.7891436815261841, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3210 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 0.7407845854759216, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 3220 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 0.7635948061943054, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3230 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 0.7478461861610413, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 3240 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 0.7684298157691956, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 3250 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 1.0287525653839111, + "learning_rate": 0.0002, + "loss": 1.4233, + "step": 3260 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 0.750616192817688, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3270 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 0.7911648750305176, + "learning_rate": 0.0002, + "loss": 1.3158, + "step": 3280 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 0.9156750440597534, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 3290 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 1.0180249214172363, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 3300 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 1.0792218446731567, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 3310 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 0.8027488589286804, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 3320 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 0.8037815093994141, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3330 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 0.7907946705818176, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3340 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 0.7206302881240845, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3350 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 0.7697674632072449, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 3360 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 0.7315130829811096, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 3370 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 0.7896273136138916, + "learning_rate": 0.0002, + "loss": 1.3598, + "step": 3380 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 0.7720345258712769, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 3390 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 0.8304631114006042, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 3400 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 0.7408214211463928, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 3410 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 0.8100157976150513, + "learning_rate": 0.0002, + "loss": 1.3957, + "step": 3420 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 0.7829574942588806, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 3430 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 0.9529728889465332, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3440 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 1.0769460201263428, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 3450 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.8941947817802429, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3460 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 0.7860096096992493, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 3470 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 0.8184044361114502, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 3480 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 0.7852717638015747, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3490 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 0.750586986541748, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3500 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 0.7966068983078003, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 3510 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 0.8387030959129333, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 3520 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 0.7373180389404297, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3530 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 0.8415353894233704, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 3540 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 0.7155488133430481, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 3550 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 0.697658896446228, + "learning_rate": 0.0002, + "loss": 1.3454, + "step": 3560 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 0.8722999095916748, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3570 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 0.8106381297111511, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3580 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 0.9320500493049622, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 3590 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 0.7583016157150269, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3600 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 0.790050208568573, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3610 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 0.7481580972671509, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3620 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.8709374666213989, + "learning_rate": 0.0002, + "loss": 1.4424, + "step": 3630 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 0.7266733050346375, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 3640 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 0.7669504880905151, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3650 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 0.7855764627456665, + "learning_rate": 0.0002, + "loss": 1.3956, + "step": 3660 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 0.8145440816879272, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3670 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 0.7487278580665588, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 3680 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 0.8390981554985046, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 3690 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 0.663752555847168, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 3700 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 0.7821969985961914, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3710 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 0.9157266020774841, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 3720 + }, + { + "epoch": 5.0, + "grad_norm": 0.7683535814285278, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3730 + }, + { + "epoch": 5.0, + "eval_loss": 1.9639414548873901, + "eval_runtime": 92.0173, + "eval_samples_per_second": 5.597, + "eval_steps_per_second": 0.706, + "step": 3730 + }, + { + "epoch": 5.013404825737266, + "grad_norm": 1.3000373840332031, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 3740 + }, + { + "epoch": 5.02680965147453, + "grad_norm": 0.8916982412338257, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 3750 + }, + { + "epoch": 5.040214477211796, + "grad_norm": 1.0365116596221924, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 3760 + }, + { + "epoch": 5.053619302949062, + "grad_norm": 0.999420166015625, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3770 + }, + { + "epoch": 5.067024128686327, + "grad_norm": 1.093572974205017, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3780 + }, + { + "epoch": 5.080428954423592, + "grad_norm": 1.1137515306472778, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3790 + }, + { + "epoch": 5.093833780160858, + "grad_norm": 1.0328283309936523, + "learning_rate": 0.0002, + "loss": 1.1646, + "step": 3800 + }, + { + "epoch": 5.107238605898123, + "grad_norm": 1.0444108247756958, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 3810 + }, + { + "epoch": 5.120643431635389, + "grad_norm": 0.858148992061615, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 3820 + }, + { + "epoch": 5.134048257372654, + "grad_norm": 0.94026780128479, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3830 + }, + { + "epoch": 5.1474530831099194, + "grad_norm": 0.8987152576446533, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 3840 + }, + { + "epoch": 5.160857908847185, + "grad_norm": 0.922997236251831, + "learning_rate": 0.0002, + "loss": 1.1562, + "step": 3850 + }, + { + "epoch": 5.174262734584451, + "grad_norm": 0.9172422289848328, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 3860 + }, + { + "epoch": 5.1876675603217155, + "grad_norm": 1.02277672290802, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 3870 + }, + { + "epoch": 5.201072386058981, + "grad_norm": 1.093826413154602, + "learning_rate": 0.0002, + "loss": 1.2206, + "step": 3880 + }, + { + "epoch": 5.214477211796247, + "grad_norm": 0.9362447261810303, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 3890 + }, + { + "epoch": 5.227882037533512, + "grad_norm": 1.0564044713974, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 3900 + }, + { + "epoch": 5.241286863270777, + "grad_norm": 0.869575023651123, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 3910 + }, + { + "epoch": 5.254691689008043, + "grad_norm": 1.0383203029632568, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3920 + }, + { + "epoch": 5.2680965147453085, + "grad_norm": 0.9146919846534729, + "learning_rate": 0.0002, + "loss": 1.2076, + "step": 3930 + }, + { + "epoch": 5.281501340482574, + "grad_norm": 0.9226430654525757, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 3940 + }, + { + "epoch": 5.294906166219839, + "grad_norm": 0.8703194260597229, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 3950 + }, + { + "epoch": 5.3083109919571045, + "grad_norm": 1.0588284730911255, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 3960 + }, + { + "epoch": 5.32171581769437, + "grad_norm": 1.1131688356399536, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 3970 + }, + { + "epoch": 5.335120643431635, + "grad_norm": 1.1073139905929565, + "learning_rate": 0.0002, + "loss": 1.1719, + "step": 3980 + }, + { + "epoch": 5.348525469168901, + "grad_norm": 0.9269049763679504, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 3990 + }, + { + "epoch": 5.361930294906166, + "grad_norm": 0.9802212715148926, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 4000 + }, + { + "epoch": 5.375335120643432, + "grad_norm": 0.9152148365974426, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 4010 + }, + { + "epoch": 5.388739946380697, + "grad_norm": 1.0395890474319458, + "learning_rate": 0.0002, + "loss": 1.2673, + "step": 4020 + }, + { + "epoch": 5.402144772117962, + "grad_norm": 1.0989106893539429, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 4030 + }, + { + "epoch": 5.415549597855228, + "grad_norm": 1.0305225849151611, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 4040 + }, + { + "epoch": 5.428954423592494, + "grad_norm": 0.8416915535926819, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 4050 + }, + { + "epoch": 5.442359249329758, + "grad_norm": 0.9120758175849915, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 4060 + }, + { + "epoch": 5.455764075067024, + "grad_norm": 1.197936773300171, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4070 + }, + { + "epoch": 5.46916890080429, + "grad_norm": 1.0116125345230103, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 4080 + }, + { + "epoch": 5.482573726541555, + "grad_norm": 1.048995018005371, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 4090 + }, + { + "epoch": 5.49597855227882, + "grad_norm": 0.929185152053833, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 4100 + }, + { + "epoch": 5.509383378016086, + "grad_norm": 0.9064884781837463, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 4110 + }, + { + "epoch": 5.522788203753351, + "grad_norm": 1.2009892463684082, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 4120 + }, + { + "epoch": 5.536193029490617, + "grad_norm": 0.9054455161094666, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 4130 + }, + { + "epoch": 5.549597855227882, + "grad_norm": 0.9978497624397278, + "learning_rate": 0.0002, + "loss": 1.1624, + "step": 4140 + }, + { + "epoch": 5.563002680965147, + "grad_norm": 0.9779615998268127, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4150 + }, + { + "epoch": 5.576407506702413, + "grad_norm": 1.0515185594558716, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 4160 + }, + { + "epoch": 5.589812332439678, + "grad_norm": 0.8618236184120178, + "learning_rate": 0.0002, + "loss": 1.2278, + "step": 4170 + }, + { + "epoch": 5.603217158176943, + "grad_norm": 0.9569384455680847, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 4180 + }, + { + "epoch": 5.616621983914209, + "grad_norm": 0.968923807144165, + "learning_rate": 0.0002, + "loss": 1.2824, + "step": 4190 + }, + { + "epoch": 5.630026809651475, + "grad_norm": 0.8759993314743042, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 4200 + }, + { + "epoch": 5.64343163538874, + "grad_norm": 0.9284833669662476, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 4210 + }, + { + "epoch": 5.656836461126005, + "grad_norm": 0.9293071031570435, + "learning_rate": 0.0002, + "loss": 1.2886, + "step": 4220 + }, + { + "epoch": 5.670241286863271, + "grad_norm": 0.9872161149978638, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 4230 + }, + { + "epoch": 5.683646112600536, + "grad_norm": 0.9545941948890686, + "learning_rate": 0.0002, + "loss": 1.2525, + "step": 4240 + }, + { + "epoch": 5.697050938337801, + "grad_norm": 1.0202341079711914, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 4250 + }, + { + "epoch": 5.710455764075067, + "grad_norm": 0.9821504950523376, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 4260 + }, + { + "epoch": 5.7238605898123325, + "grad_norm": 1.0581456422805786, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 4270 + }, + { + "epoch": 5.737265415549598, + "grad_norm": 0.9639395475387573, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 4280 + }, + { + "epoch": 5.750670241286863, + "grad_norm": 2.205458164215088, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 4290 + }, + { + "epoch": 5.7640750670241285, + "grad_norm": 1.0294393301010132, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 4300 + }, + { + "epoch": 5.777479892761394, + "grad_norm": 1.0360256433486938, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4310 + }, + { + "epoch": 5.79088471849866, + "grad_norm": 0.9390154480934143, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4320 + }, + { + "epoch": 5.804289544235925, + "grad_norm": 0.9048963189125061, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 4330 + }, + { + "epoch": 5.81769436997319, + "grad_norm": 0.9310713410377502, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 4340 + }, + { + "epoch": 5.831099195710456, + "grad_norm": 1.038282871246338, + "learning_rate": 0.0002, + "loss": 1.2393, + "step": 4350 + }, + { + "epoch": 5.8445040214477215, + "grad_norm": 0.9194827079772949, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 4360 + }, + { + "epoch": 5.857908847184986, + "grad_norm": 0.9568411111831665, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 4370 + }, + { + "epoch": 5.871313672922252, + "grad_norm": 0.9088910818099976, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 4380 + }, + { + "epoch": 5.884718498659518, + "grad_norm": 1.0605647563934326, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 4390 + }, + { + "epoch": 5.898123324396783, + "grad_norm": 0.8016388416290283, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 4400 + }, + { + "epoch": 5.911528150134048, + "grad_norm": 1.0792853832244873, + "learning_rate": 0.0002, + "loss": 1.3046, + "step": 4410 + }, + { + "epoch": 5.924932975871314, + "grad_norm": 1.059403657913208, + "learning_rate": 0.0002, + "loss": 1.282, + "step": 4420 + }, + { + "epoch": 5.938337801608579, + "grad_norm": 0.87492436170578, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 4430 + }, + { + "epoch": 5.951742627345844, + "grad_norm": 1.0911097526550293, + "learning_rate": 0.0002, + "loss": 1.2373, + "step": 4440 + }, + { + "epoch": 5.96514745308311, + "grad_norm": 0.8860997557640076, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4450 + }, + { + "epoch": 5.978552278820375, + "grad_norm": 0.9176826477050781, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 4460 + }, + { + "epoch": 5.991957104557641, + "grad_norm": 0.9018680453300476, + "learning_rate": 0.0002, + "loss": 1.2725, + "step": 4470 + }, + { + "epoch": 6.0, + "eval_loss": 2.0600433349609375, + "eval_runtime": 92.2728, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 0.704, + "step": 4476 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.071392578300805e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-4476/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8fe140a5fb94123a1115bbdaa365523bb11c0cf9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0fcbfccb3644787458f909e9cc3574d9f720fa9a4ff8328ce6a75edd005cdb7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..241f19b0f7073da8437e743d6e947621f77511fe --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e53aaade393dec7d348a934246b3def9dfd7e070e9832d698c09467e37b5fdcd +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f3f57ed9d0cc88f28b70082fb130876a29bfc2f3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c268ee448484e3d92f4132d1ce91d89da9498cc05df06f212cd6a62ba2951382 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a16701ac9eaba5c16c67f6b8de5d65c625786f6b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ef0d31c8c22184e2c3b508b6bd346903221dca863552a3e7bd214cc5122b82d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c686e41001455eb2525f04b07315b6db7279fdf0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/trainer_state.json @@ -0,0 +1,3743 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 5222, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.40689945220947266, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2240 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.4699273705482483, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2250 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.5531830787658691, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 2260 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.5441790223121643, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 2270 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.6145012974739075, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2280 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.6997102499008179, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 2290 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.6082330942153931, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2300 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 2310 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.7200340032577515, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2320 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.721092939376831, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 2330 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.5344305038452148, + "learning_rate": 0.0002, + "loss": 1.5307, + "step": 2340 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.5533145070075989, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2350 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.5976856350898743, + "learning_rate": 0.0002, + "loss": 1.529, + "step": 2360 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.4974960386753082, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2370 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.6377840042114258, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 2380 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.5447293519973755, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2390 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.49577030539512634, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2400 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.5588275790214539, + "learning_rate": 0.0002, + "loss": 1.4768, + "step": 2410 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.6429149508476257, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 2420 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.5713154673576355, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2430 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.6348955035209656, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 2440 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.5675528645515442, + "learning_rate": 0.0002, + "loss": 1.509, + "step": 2450 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.5570188164710999, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2460 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.6029602289199829, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2470 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.523206353187561, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2480 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.5912408828735352, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2490 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.5524865984916687, + "learning_rate": 0.0002, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.60386061668396, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 2510 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.5838595628738403, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2520 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.5400974154472351, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2530 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.6150162220001221, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 2540 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.5279412269592285, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2550 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.5974063873291016, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 2560 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.661573052406311, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2570 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.577880322933197, + "learning_rate": 0.0002, + "loss": 1.5204, + "step": 2580 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.5532318949699402, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 2590 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.5764921307563782, + "learning_rate": 0.0002, + "loss": 1.4933, + "step": 2600 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.6145682334899902, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2610 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.6561126112937927, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 2620 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.5673288106918335, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2630 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.6215338706970215, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 2640 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.5512040853500366, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2650 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.49503496289253235, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 2660 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.5714912414550781, + "learning_rate": 0.0002, + "loss": 1.524, + "step": 2670 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.6883154511451721, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 2680 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.5989556908607483, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2690 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.630268394947052, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 2700 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.5819358229637146, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 2710 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.6102097034454346, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2720 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.6858501434326172, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 2730 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.6328608393669128, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 2740 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.5366981029510498, + "learning_rate": 0.0002, + "loss": 1.5211, + "step": 2750 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.7048938274383545, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 2760 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.5371938347816467, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2770 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.6142212152481079, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 2780 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.6164522171020508, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 2790 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 2800 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.6194717288017273, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2810 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.676721453666687, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2820 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.5646911263465881, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 2830 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.5874826908111572, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 2840 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.6395232677459717, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2850 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.624563992023468, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2860 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.59019935131073, + "learning_rate": 0.0002, + "loss": 1.479, + "step": 2870 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.6700479984283447, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2880 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.6131282448768616, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 2890 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.6807777881622314, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 2900 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.5297217965126038, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2910 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.5795540809631348, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2920 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.5549747347831726, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 2930 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.5895092487335205, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 2940 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.590002715587616, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2950 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.7847695350646973, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2960 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.5845848321914673, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 2970 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.5861571431159973, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2980 + }, + { + "epoch": 4.0, + "eval_loss": 1.8821998834609985, + "eval_runtime": 90.8701, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.715, + "step": 2984 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.6209918260574341, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 2990 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 0.607226550579071, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3000 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 0.6677961349487305, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 3010 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 0.9053248763084412, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 3020 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 0.6815084218978882, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3030 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.6709407567977905, + "learning_rate": 0.0002, + "loss": 1.3, + "step": 3040 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 0.728184163570404, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 3050 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 0.817628800868988, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3060 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 0.7384206056594849, + "learning_rate": 0.0002, + "loss": 1.3496, + "step": 3070 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 0.7380280494689941, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3080 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 0.8197277188301086, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3090 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.8971617817878723, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3100 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.7409387826919556, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 3110 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 0.6948909163475037, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3120 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.7619595527648926, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3130 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 0.7657106518745422, + "learning_rate": 0.0002, + "loss": 1.3864, + "step": 3140 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 0.6919401288032532, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 3150 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.6991415023803711, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3160 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 0.7349252700805664, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3170 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 0.8838240504264832, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3180 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.7240107655525208, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3190 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.7338636517524719, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3200 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 0.7891436815261841, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3210 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 0.7407845854759216, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 3220 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 0.7635948061943054, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3230 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 0.7478461861610413, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 3240 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 0.7684298157691956, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 3250 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 1.0287525653839111, + "learning_rate": 0.0002, + "loss": 1.4233, + "step": 3260 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 0.750616192817688, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3270 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 0.7911648750305176, + "learning_rate": 0.0002, + "loss": 1.3158, + "step": 3280 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 0.9156750440597534, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 3290 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 1.0180249214172363, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 3300 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 1.0792218446731567, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 3310 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 0.8027488589286804, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 3320 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 0.8037815093994141, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3330 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 0.7907946705818176, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3340 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 0.7206302881240845, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3350 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 0.7697674632072449, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 3360 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 0.7315130829811096, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 3370 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 0.7896273136138916, + "learning_rate": 0.0002, + "loss": 1.3598, + "step": 3380 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 0.7720345258712769, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 3390 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 0.8304631114006042, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 3400 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 0.7408214211463928, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 3410 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 0.8100157976150513, + "learning_rate": 0.0002, + "loss": 1.3957, + "step": 3420 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 0.7829574942588806, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 3430 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 0.9529728889465332, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3440 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 1.0769460201263428, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 3450 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.8941947817802429, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3460 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 0.7860096096992493, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 3470 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 0.8184044361114502, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 3480 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 0.7852717638015747, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3490 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 0.750586986541748, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3500 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 0.7966068983078003, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 3510 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 0.8387030959129333, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 3520 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 0.7373180389404297, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3530 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 0.8415353894233704, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 3540 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 0.7155488133430481, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 3550 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 0.697658896446228, + "learning_rate": 0.0002, + "loss": 1.3454, + "step": 3560 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 0.8722999095916748, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3570 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 0.8106381297111511, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3580 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 0.9320500493049622, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 3590 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 0.7583016157150269, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3600 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 0.790050208568573, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3610 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 0.7481580972671509, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3620 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.8709374666213989, + "learning_rate": 0.0002, + "loss": 1.4424, + "step": 3630 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 0.7266733050346375, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 3640 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 0.7669504880905151, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3650 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 0.7855764627456665, + "learning_rate": 0.0002, + "loss": 1.3956, + "step": 3660 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 0.8145440816879272, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3670 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 0.7487278580665588, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 3680 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 0.8390981554985046, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 3690 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 0.663752555847168, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 3700 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 0.7821969985961914, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3710 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 0.9157266020774841, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 3720 + }, + { + "epoch": 5.0, + "grad_norm": 0.7683535814285278, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3730 + }, + { + "epoch": 5.0, + "eval_loss": 1.9639414548873901, + "eval_runtime": 92.0173, + "eval_samples_per_second": 5.597, + "eval_steps_per_second": 0.706, + "step": 3730 + }, + { + "epoch": 5.013404825737266, + "grad_norm": 1.3000373840332031, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 3740 + }, + { + "epoch": 5.02680965147453, + "grad_norm": 0.8916982412338257, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 3750 + }, + { + "epoch": 5.040214477211796, + "grad_norm": 1.0365116596221924, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 3760 + }, + { + "epoch": 5.053619302949062, + "grad_norm": 0.999420166015625, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3770 + }, + { + "epoch": 5.067024128686327, + "grad_norm": 1.093572974205017, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3780 + }, + { + "epoch": 5.080428954423592, + "grad_norm": 1.1137515306472778, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3790 + }, + { + "epoch": 5.093833780160858, + "grad_norm": 1.0328283309936523, + "learning_rate": 0.0002, + "loss": 1.1646, + "step": 3800 + }, + { + "epoch": 5.107238605898123, + "grad_norm": 1.0444108247756958, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 3810 + }, + { + "epoch": 5.120643431635389, + "grad_norm": 0.858148992061615, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 3820 + }, + { + "epoch": 5.134048257372654, + "grad_norm": 0.94026780128479, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3830 + }, + { + "epoch": 5.1474530831099194, + "grad_norm": 0.8987152576446533, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 3840 + }, + { + "epoch": 5.160857908847185, + "grad_norm": 0.922997236251831, + "learning_rate": 0.0002, + "loss": 1.1562, + "step": 3850 + }, + { + "epoch": 5.174262734584451, + "grad_norm": 0.9172422289848328, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 3860 + }, + { + "epoch": 5.1876675603217155, + "grad_norm": 1.02277672290802, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 3870 + }, + { + "epoch": 5.201072386058981, + "grad_norm": 1.093826413154602, + "learning_rate": 0.0002, + "loss": 1.2206, + "step": 3880 + }, + { + "epoch": 5.214477211796247, + "grad_norm": 0.9362447261810303, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 3890 + }, + { + "epoch": 5.227882037533512, + "grad_norm": 1.0564044713974, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 3900 + }, + { + "epoch": 5.241286863270777, + "grad_norm": 0.869575023651123, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 3910 + }, + { + "epoch": 5.254691689008043, + "grad_norm": 1.0383203029632568, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3920 + }, + { + "epoch": 5.2680965147453085, + "grad_norm": 0.9146919846534729, + "learning_rate": 0.0002, + "loss": 1.2076, + "step": 3930 + }, + { + "epoch": 5.281501340482574, + "grad_norm": 0.9226430654525757, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 3940 + }, + { + "epoch": 5.294906166219839, + "grad_norm": 0.8703194260597229, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 3950 + }, + { + "epoch": 5.3083109919571045, + "grad_norm": 1.0588284730911255, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 3960 + }, + { + "epoch": 5.32171581769437, + "grad_norm": 1.1131688356399536, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 3970 + }, + { + "epoch": 5.335120643431635, + "grad_norm": 1.1073139905929565, + "learning_rate": 0.0002, + "loss": 1.1719, + "step": 3980 + }, + { + "epoch": 5.348525469168901, + "grad_norm": 0.9269049763679504, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 3990 + }, + { + "epoch": 5.361930294906166, + "grad_norm": 0.9802212715148926, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 4000 + }, + { + "epoch": 5.375335120643432, + "grad_norm": 0.9152148365974426, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 4010 + }, + { + "epoch": 5.388739946380697, + "grad_norm": 1.0395890474319458, + "learning_rate": 0.0002, + "loss": 1.2673, + "step": 4020 + }, + { + "epoch": 5.402144772117962, + "grad_norm": 1.0989106893539429, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 4030 + }, + { + "epoch": 5.415549597855228, + "grad_norm": 1.0305225849151611, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 4040 + }, + { + "epoch": 5.428954423592494, + "grad_norm": 0.8416915535926819, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 4050 + }, + { + "epoch": 5.442359249329758, + "grad_norm": 0.9120758175849915, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 4060 + }, + { + "epoch": 5.455764075067024, + "grad_norm": 1.197936773300171, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4070 + }, + { + "epoch": 5.46916890080429, + "grad_norm": 1.0116125345230103, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 4080 + }, + { + "epoch": 5.482573726541555, + "grad_norm": 1.048995018005371, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 4090 + }, + { + "epoch": 5.49597855227882, + "grad_norm": 0.929185152053833, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 4100 + }, + { + "epoch": 5.509383378016086, + "grad_norm": 0.9064884781837463, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 4110 + }, + { + "epoch": 5.522788203753351, + "grad_norm": 1.2009892463684082, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 4120 + }, + { + "epoch": 5.536193029490617, + "grad_norm": 0.9054455161094666, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 4130 + }, + { + "epoch": 5.549597855227882, + "grad_norm": 0.9978497624397278, + "learning_rate": 0.0002, + "loss": 1.1624, + "step": 4140 + }, + { + "epoch": 5.563002680965147, + "grad_norm": 0.9779615998268127, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4150 + }, + { + "epoch": 5.576407506702413, + "grad_norm": 1.0515185594558716, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 4160 + }, + { + "epoch": 5.589812332439678, + "grad_norm": 0.8618236184120178, + "learning_rate": 0.0002, + "loss": 1.2278, + "step": 4170 + }, + { + "epoch": 5.603217158176943, + "grad_norm": 0.9569384455680847, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 4180 + }, + { + "epoch": 5.616621983914209, + "grad_norm": 0.968923807144165, + "learning_rate": 0.0002, + "loss": 1.2824, + "step": 4190 + }, + { + "epoch": 5.630026809651475, + "grad_norm": 0.8759993314743042, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 4200 + }, + { + "epoch": 5.64343163538874, + "grad_norm": 0.9284833669662476, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 4210 + }, + { + "epoch": 5.656836461126005, + "grad_norm": 0.9293071031570435, + "learning_rate": 0.0002, + "loss": 1.2886, + "step": 4220 + }, + { + "epoch": 5.670241286863271, + "grad_norm": 0.9872161149978638, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 4230 + }, + { + "epoch": 5.683646112600536, + "grad_norm": 0.9545941948890686, + "learning_rate": 0.0002, + "loss": 1.2525, + "step": 4240 + }, + { + "epoch": 5.697050938337801, + "grad_norm": 1.0202341079711914, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 4250 + }, + { + "epoch": 5.710455764075067, + "grad_norm": 0.9821504950523376, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 4260 + }, + { + "epoch": 5.7238605898123325, + "grad_norm": 1.0581456422805786, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 4270 + }, + { + "epoch": 5.737265415549598, + "grad_norm": 0.9639395475387573, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 4280 + }, + { + "epoch": 5.750670241286863, + "grad_norm": 2.205458164215088, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 4290 + }, + { + "epoch": 5.7640750670241285, + "grad_norm": 1.0294393301010132, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 4300 + }, + { + "epoch": 5.777479892761394, + "grad_norm": 1.0360256433486938, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4310 + }, + { + "epoch": 5.79088471849866, + "grad_norm": 0.9390154480934143, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4320 + }, + { + "epoch": 5.804289544235925, + "grad_norm": 0.9048963189125061, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 4330 + }, + { + "epoch": 5.81769436997319, + "grad_norm": 0.9310713410377502, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 4340 + }, + { + "epoch": 5.831099195710456, + "grad_norm": 1.038282871246338, + "learning_rate": 0.0002, + "loss": 1.2393, + "step": 4350 + }, + { + "epoch": 5.8445040214477215, + "grad_norm": 0.9194827079772949, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 4360 + }, + { + "epoch": 5.857908847184986, + "grad_norm": 0.9568411111831665, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 4370 + }, + { + "epoch": 5.871313672922252, + "grad_norm": 0.9088910818099976, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 4380 + }, + { + "epoch": 5.884718498659518, + "grad_norm": 1.0605647563934326, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 4390 + }, + { + "epoch": 5.898123324396783, + "grad_norm": 0.8016388416290283, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 4400 + }, + { + "epoch": 5.911528150134048, + "grad_norm": 1.0792853832244873, + "learning_rate": 0.0002, + "loss": 1.3046, + "step": 4410 + }, + { + "epoch": 5.924932975871314, + "grad_norm": 1.059403657913208, + "learning_rate": 0.0002, + "loss": 1.282, + "step": 4420 + }, + { + "epoch": 5.938337801608579, + "grad_norm": 0.87492436170578, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 4430 + }, + { + "epoch": 5.951742627345844, + "grad_norm": 1.0911097526550293, + "learning_rate": 0.0002, + "loss": 1.2373, + "step": 4440 + }, + { + "epoch": 5.96514745308311, + "grad_norm": 0.8860997557640076, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4450 + }, + { + "epoch": 5.978552278820375, + "grad_norm": 0.9176826477050781, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 4460 + }, + { + "epoch": 5.991957104557641, + "grad_norm": 0.9018680453300476, + "learning_rate": 0.0002, + "loss": 1.2725, + "step": 4470 + }, + { + "epoch": 6.0, + "eval_loss": 2.0600433349609375, + "eval_runtime": 92.2728, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 0.704, + "step": 4476 + }, + { + "epoch": 6.005361930294906, + "grad_norm": 0.8612148761749268, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 4480 + }, + { + "epoch": 6.018766756032171, + "grad_norm": 1.170229434967041, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 4490 + }, + { + "epoch": 6.032171581769437, + "grad_norm": 1.1005233526229858, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 4500 + }, + { + "epoch": 6.045576407506703, + "grad_norm": 1.1763442754745483, + "learning_rate": 0.0002, + "loss": 1.0936, + "step": 4510 + }, + { + "epoch": 6.058981233243967, + "grad_norm": 1.0595353841781616, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 4520 + }, + { + "epoch": 6.072386058981233, + "grad_norm": 1.3554084300994873, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 4530 + }, + { + "epoch": 6.085790884718499, + "grad_norm": 1.238821268081665, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 4540 + }, + { + "epoch": 6.099195710455764, + "grad_norm": 1.0496071577072144, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 4550 + }, + { + "epoch": 6.112600536193029, + "grad_norm": 1.3410215377807617, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 4560 + }, + { + "epoch": 6.126005361930295, + "grad_norm": 1.2559033632278442, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 4570 + }, + { + "epoch": 6.13941018766756, + "grad_norm": 1.2556545734405518, + "learning_rate": 0.0002, + "loss": 1.0645, + "step": 4580 + }, + { + "epoch": 6.152815013404826, + "grad_norm": 1.050678014755249, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 4590 + }, + { + "epoch": 6.166219839142091, + "grad_norm": 1.566770076751709, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4600 + }, + { + "epoch": 6.1796246648793565, + "grad_norm": 1.1482226848602295, + "learning_rate": 0.0002, + "loss": 1.0617, + "step": 4610 + }, + { + "epoch": 6.193029490616622, + "grad_norm": 1.2731150388717651, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 4620 + }, + { + "epoch": 6.206434316353888, + "grad_norm": 1.4135994911193848, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 4630 + }, + { + "epoch": 6.2198391420911525, + "grad_norm": 1.2925093173980713, + "learning_rate": 0.0002, + "loss": 1.0666, + "step": 4640 + }, + { + "epoch": 6.233243967828418, + "grad_norm": 1.1199861764907837, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 4650 + }, + { + "epoch": 6.246648793565684, + "grad_norm": 1.2010078430175781, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 4660 + }, + { + "epoch": 6.2600536193029495, + "grad_norm": 1.2655692100524902, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4670 + }, + { + "epoch": 6.273458445040214, + "grad_norm": 1.0960880517959595, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 4680 + }, + { + "epoch": 6.28686327077748, + "grad_norm": 1.170759916305542, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 4690 + }, + { + "epoch": 6.3002680965147455, + "grad_norm": 1.1199755668640137, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 4700 + }, + { + "epoch": 6.31367292225201, + "grad_norm": 1.1477710008621216, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 4710 + }, + { + "epoch": 6.327077747989276, + "grad_norm": 1.0862090587615967, + "learning_rate": 0.0002, + "loss": 1.0411, + "step": 4720 + }, + { + "epoch": 6.340482573726542, + "grad_norm": 1.1428112983703613, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 4730 + }, + { + "epoch": 6.353887399463807, + "grad_norm": 1.155534029006958, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 4740 + }, + { + "epoch": 6.367292225201073, + "grad_norm": 1.2997788190841675, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 4750 + }, + { + "epoch": 6.380697050938338, + "grad_norm": 1.1087043285369873, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 4760 + }, + { + "epoch": 6.394101876675603, + "grad_norm": 1.3957210779190063, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 4770 + }, + { + "epoch": 6.407506702412869, + "grad_norm": 1.1346395015716553, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 4780 + }, + { + "epoch": 6.420911528150134, + "grad_norm": 1.3830486536026, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 4790 + }, + { + "epoch": 6.434316353887399, + "grad_norm": 1.1137559413909912, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 4800 + }, + { + "epoch": 6.447721179624665, + "grad_norm": 1.151821494102478, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 4810 + }, + { + "epoch": 6.461126005361931, + "grad_norm": 1.122589111328125, + "learning_rate": 0.0002, + "loss": 1.0821, + "step": 4820 + }, + { + "epoch": 6.474530831099195, + "grad_norm": 1.2847239971160889, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 4830 + }, + { + "epoch": 6.487935656836461, + "grad_norm": 1.027617335319519, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 4840 + }, + { + "epoch": 6.501340482573727, + "grad_norm": 1.3375194072723389, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 4850 + }, + { + "epoch": 6.514745308310992, + "grad_norm": 1.1723220348358154, + "learning_rate": 0.0002, + "loss": 1.1055, + "step": 4860 + }, + { + "epoch": 6.528150134048257, + "grad_norm": 1.7034224271774292, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4870 + }, + { + "epoch": 6.541554959785523, + "grad_norm": 1.0840927362442017, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 4880 + }, + { + "epoch": 6.554959785522788, + "grad_norm": 1.3088481426239014, + "learning_rate": 0.0002, + "loss": 1.1194, + "step": 4890 + }, + { + "epoch": 6.568364611260054, + "grad_norm": 1.1394107341766357, + "learning_rate": 0.0002, + "loss": 1.1513, + "step": 4900 + }, + { + "epoch": 6.581769436997319, + "grad_norm": 1.0243184566497803, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4910 + }, + { + "epoch": 6.595174262734584, + "grad_norm": 1.0814571380615234, + "learning_rate": 0.0002, + "loss": 1.2096, + "step": 4920 + }, + { + "epoch": 6.60857908847185, + "grad_norm": 1.1652323007583618, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 4930 + }, + { + "epoch": 6.621983914209116, + "grad_norm": 1.0203579664230347, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 4940 + }, + { + "epoch": 6.6353887399463805, + "grad_norm": 1.3823212385177612, + "learning_rate": 0.0002, + "loss": 1.1243, + "step": 4950 + }, + { + "epoch": 6.648793565683646, + "grad_norm": 1.248955488204956, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 4960 + }, + { + "epoch": 6.662198391420912, + "grad_norm": 1.2215739488601685, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 4970 + }, + { + "epoch": 6.6756032171581765, + "grad_norm": 1.307869553565979, + "learning_rate": 0.0002, + "loss": 1.1109, + "step": 4980 + }, + { + "epoch": 6.689008042895442, + "grad_norm": 1.4434916973114014, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 4990 + }, + { + "epoch": 6.702412868632708, + "grad_norm": 1.1840227842330933, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 5000 + }, + { + "epoch": 6.7158176943699734, + "grad_norm": 1.1775435209274292, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 5010 + }, + { + "epoch": 6.729222520107239, + "grad_norm": 1.1639968156814575, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 5020 + }, + { + "epoch": 6.742627345844504, + "grad_norm": 1.3774648904800415, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 5030 + }, + { + "epoch": 6.7560321715817695, + "grad_norm": 1.0328693389892578, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 5040 + }, + { + "epoch": 6.769436997319035, + "grad_norm": 1.0495599508285522, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 5050 + }, + { + "epoch": 6.7828418230563, + "grad_norm": 1.3220133781433105, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 5060 + }, + { + "epoch": 6.7962466487935655, + "grad_norm": 1.3658279180526733, + "learning_rate": 0.0002, + "loss": 1.13, + "step": 5070 + }, + { + "epoch": 6.809651474530831, + "grad_norm": 1.3788504600524902, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 5080 + }, + { + "epoch": 6.823056300268097, + "grad_norm": 1.2342770099639893, + "learning_rate": 0.0002, + "loss": 1.1331, + "step": 5090 + }, + { + "epoch": 6.836461126005362, + "grad_norm": 1.3752578496932983, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 5100 + }, + { + "epoch": 6.849865951742627, + "grad_norm": 1.0902243852615356, + "learning_rate": 0.0002, + "loss": 1.078, + "step": 5110 + }, + { + "epoch": 6.863270777479893, + "grad_norm": 1.2125890254974365, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 5120 + }, + { + "epoch": 6.8766756032171585, + "grad_norm": 1.2979270219802856, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 5130 + }, + { + "epoch": 6.890080428954423, + "grad_norm": 1.2894749641418457, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 5140 + }, + { + "epoch": 6.903485254691689, + "grad_norm": 1.4804800748825073, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 5150 + }, + { + "epoch": 6.916890080428955, + "grad_norm": 1.1119170188903809, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 5160 + }, + { + "epoch": 6.930294906166219, + "grad_norm": 1.4991406202316284, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 5170 + }, + { + "epoch": 6.943699731903485, + "grad_norm": 1.2187672853469849, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 5180 + }, + { + "epoch": 6.957104557640751, + "grad_norm": 1.2419520616531372, + "learning_rate": 0.0002, + "loss": 1.1991, + "step": 5190 + }, + { + "epoch": 6.970509383378016, + "grad_norm": 1.359859585762024, + "learning_rate": 0.0002, + "loss": 1.1231, + "step": 5200 + }, + { + "epoch": 6.983914209115282, + "grad_norm": 1.3679486513137817, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 5210 + }, + { + "epoch": 6.997319034852547, + "grad_norm": 1.2109483480453491, + "learning_rate": 0.0002, + "loss": 1.1856, + "step": 5220 + }, + { + "epoch": 7.0, + "eval_loss": 2.194319725036621, + "eval_runtime": 93.0187, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 0.699, + "step": 5222 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4166246746842726e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5222/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e4dec75406ec3adcd4e6ea903bba6bc2f1da6111 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccc8d8aa3bdfc7f90ff90400bf4777ba33574a74dabc36d9f31cfffd29ac5504 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..166ffd9494b44e4bf4aaa1e3da4e44d6c72a23d7 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c24dc733b5c239551c6a6d497774d0231ed3a892d563197a4464bbf40c3c5d +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f2bf6e10b6df563967b62835efd97bccfde39b23 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1bcafa496d0756b7f68789477428cd13abe09540e27724304722a3d4aa2b348 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2661c7f14078f4c3582651cc05fed6d69055cc82 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d81ba016d510169ae7c5dea51d0d57cf21674b4202cc7e76d03cf372f4fe057d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6324004615527c9af8aef102acee9bbd0a242a6e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/trainer_state.json @@ -0,0 +1,4269 @@ +{ + "best_metric": 1.8150336742401123, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 5968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.2998153269290924, + "learning_rate": 0.0002, + "loss": 1.7741, + "step": 750 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.34353747963905334, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 760 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.3506847321987152, + "learning_rate": 0.0002, + "loss": 1.6997, + "step": 770 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.3434218764305115, + "learning_rate": 0.0002, + "loss": 1.7277, + "step": 780 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.39283573627471924, + "learning_rate": 0.0002, + "loss": 1.7201, + "step": 790 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.36534103751182556, + "learning_rate": 0.0002, + "loss": 1.7134, + "step": 800 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.32713210582733154, + "learning_rate": 0.0002, + "loss": 1.73, + "step": 810 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.4298870861530304, + "learning_rate": 0.0002, + "loss": 1.733, + "step": 820 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.3652895987033844, + "learning_rate": 0.0002, + "loss": 1.7152, + "step": 830 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.4341593086719513, + "learning_rate": 0.0002, + "loss": 1.7952, + "step": 840 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.3925093412399292, + "learning_rate": 0.0002, + "loss": 1.7353, + "step": 850 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.3695056736469269, + "learning_rate": 0.0002, + "loss": 1.7484, + "step": 860 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.36138468980789185, + "learning_rate": 0.0002, + "loss": 1.7959, + "step": 870 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.33074072003364563, + "learning_rate": 0.0002, + "loss": 1.7144, + "step": 880 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.3552579879760742, + "learning_rate": 0.0002, + "loss": 1.7303, + "step": 890 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.38744238018989563, + "learning_rate": 0.0002, + "loss": 1.6857, + "step": 900 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.3563305735588074, + "learning_rate": 0.0002, + "loss": 1.7543, + "step": 910 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.35686084628105164, + "learning_rate": 0.0002, + "loss": 1.7406, + "step": 920 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.4001927077770233, + "learning_rate": 0.0002, + "loss": 1.765, + "step": 930 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.35909149050712585, + "learning_rate": 0.0002, + "loss": 1.7147, + "step": 940 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.35123375058174133, + "learning_rate": 0.0002, + "loss": 1.6712, + "step": 950 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.38013333082199097, + "learning_rate": 0.0002, + "loss": 1.7245, + "step": 960 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0002, + "loss": 1.7395, + "step": 970 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.4208183288574219, + "learning_rate": 0.0002, + "loss": 1.707, + "step": 980 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.3613564074039459, + "learning_rate": 0.0002, + "loss": 1.7122, + "step": 990 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.34058499336242676, + "learning_rate": 0.0002, + "loss": 1.6776, + "step": 1000 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.3563075065612793, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1010 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.36920854449272156, + "learning_rate": 0.0002, + "loss": 1.7167, + "step": 1020 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.3889519274234772, + "learning_rate": 0.0002, + "loss": 1.7143, + "step": 1030 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.3664555251598358, + "learning_rate": 0.0002, + "loss": 1.8023, + "step": 1040 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.38175567984580994, + "learning_rate": 0.0002, + "loss": 1.7961, + "step": 1050 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.42346763610839844, + "learning_rate": 0.0002, + "loss": 1.7363, + "step": 1060 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.3456033170223236, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1070 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.38931941986083984, + "learning_rate": 0.0002, + "loss": 1.6846, + "step": 1080 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.5473279356956482, + "learning_rate": 0.0002, + "loss": 1.7416, + "step": 1090 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.3517422676086426, + "learning_rate": 0.0002, + "loss": 1.6927, + "step": 1100 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.3511943221092224, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1110 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.3762837052345276, + "learning_rate": 0.0002, + "loss": 1.7947, + "step": 1120 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.37149128317832947, + "learning_rate": 0.0002, + "loss": 1.6893, + "step": 1130 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.3945842981338501, + "learning_rate": 0.0002, + "loss": 1.6944, + "step": 1140 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.40258195996284485, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 1150 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.3959120213985443, + "learning_rate": 0.0002, + "loss": 1.6798, + "step": 1160 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.37792712450027466, + "learning_rate": 0.0002, + "loss": 1.7789, + "step": 1170 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.4019201099872589, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 1180 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.40712273120880127, + "learning_rate": 0.0002, + "loss": 1.6887, + "step": 1190 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.4131423234939575, + "learning_rate": 0.0002, + "loss": 1.7131, + "step": 1200 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.3738194704055786, + "learning_rate": 0.0002, + "loss": 1.6757, + "step": 1210 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.3987765908241272, + "learning_rate": 0.0002, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.34117406606674194, + "learning_rate": 0.0002, + "loss": 1.7374, + "step": 1230 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.34900516271591187, + "learning_rate": 0.0002, + "loss": 1.7869, + "step": 1240 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.35759788751602173, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 1250 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.3837822377681732, + "learning_rate": 0.0002, + "loss": 1.7697, + "step": 1260 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.3671180307865143, + "learning_rate": 0.0002, + "loss": 1.7972, + "step": 1270 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.4124658703804016, + "learning_rate": 0.0002, + "loss": 1.7198, + "step": 1280 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.39059901237487793, + "learning_rate": 0.0002, + "loss": 1.8006, + "step": 1290 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.4006287157535553, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 1300 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.3606216013431549, + "learning_rate": 0.0002, + "loss": 1.8196, + "step": 1310 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.3861924111843109, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 1320 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.41432589292526245, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 1330 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.3751705586910248, + "learning_rate": 0.0002, + "loss": 1.7069, + "step": 1340 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.36217355728149414, + "learning_rate": 0.0002, + "loss": 1.717, + "step": 1350 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.35937434434890747, + "learning_rate": 0.0002, + "loss": 1.7878, + "step": 1360 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.36120304465293884, + "learning_rate": 0.0002, + "loss": 1.7026, + "step": 1370 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.36082401871681213, + "learning_rate": 0.0002, + "loss": 1.7378, + "step": 1380 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.3616413176059723, + "learning_rate": 0.0002, + "loss": 1.6938, + "step": 1390 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.3664911091327667, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1400 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.3545122444629669, + "learning_rate": 0.0002, + "loss": 1.7548, + "step": 1410 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.38186976313591003, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 1420 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.41099944710731506, + "learning_rate": 0.0002, + "loss": 1.788, + "step": 1430 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.34538620710372925, + "learning_rate": 0.0002, + "loss": 1.7377, + "step": 1440 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.35443663597106934, + "learning_rate": 0.0002, + "loss": 1.7349, + "step": 1450 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.4783519208431244, + "learning_rate": 0.0002, + "loss": 1.7457, + "step": 1460 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.36285310983657837, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 1470 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.361730694770813, + "learning_rate": 0.0002, + "loss": 1.7607, + "step": 1480 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.38347867131233215, + "learning_rate": 0.0002, + "loss": 1.7133, + "step": 1490 + }, + { + "epoch": 2.0, + "eval_loss": 1.8150336742401123, + "eval_runtime": 91.1797, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 0.713, + "step": 1492 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.3648935854434967, + "learning_rate": 0.0002, + "loss": 1.6673, + "step": 1500 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.3521469533443451, + "learning_rate": 0.0002, + "loss": 1.6754, + "step": 1510 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.4275520145893097, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 1520 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.4140888750553131, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 1530 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.37715452909469604, + "learning_rate": 0.0002, + "loss": 1.6237, + "step": 1540 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.4375513195991516, + "learning_rate": 0.0002, + "loss": 1.6426, + "step": 1550 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.44963088631629944, + "learning_rate": 0.0002, + "loss": 1.6675, + "step": 1560 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.45463916659355164, + "learning_rate": 0.0002, + "loss": 1.6731, + "step": 1570 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.3952806293964386, + "learning_rate": 0.0002, + "loss": 1.5928, + "step": 1580 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.44873616099357605, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 1590 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.45529067516326904, + "learning_rate": 0.0002, + "loss": 1.5953, + "step": 1600 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.4483625590801239, + "learning_rate": 0.0002, + "loss": 1.634, + "step": 1610 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.3954690992832184, + "learning_rate": 0.0002, + "loss": 1.6202, + "step": 1620 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.4297006130218506, + "learning_rate": 0.0002, + "loss": 1.6657, + "step": 1630 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.4121869206428528, + "learning_rate": 0.0002, + "loss": 1.5499, + "step": 1640 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.45843517780303955, + "learning_rate": 0.0002, + "loss": 1.6017, + "step": 1650 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.44742295145988464, + "learning_rate": 0.0002, + "loss": 1.6699, + "step": 1660 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.500198483467102, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 1670 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.4322265386581421, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 1680 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.480289101600647, + "learning_rate": 0.0002, + "loss": 1.6486, + "step": 1690 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.4532500207424164, + "learning_rate": 0.0002, + "loss": 1.6396, + "step": 1700 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.41848474740982056, + "learning_rate": 0.0002, + "loss": 1.6088, + "step": 1710 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.47211962938308716, + "learning_rate": 0.0002, + "loss": 1.6447, + "step": 1720 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.4273032248020172, + "learning_rate": 0.0002, + "loss": 1.7174, + "step": 1730 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.4660373330116272, + "learning_rate": 0.0002, + "loss": 1.617, + "step": 1740 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.4409862756729126, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 1750 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.44795849919319153, + "learning_rate": 0.0002, + "loss": 1.6579, + "step": 1760 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.4470100402832031, + "learning_rate": 0.0002, + "loss": 1.5736, + "step": 1770 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.4184521436691284, + "learning_rate": 0.0002, + "loss": 1.6277, + "step": 1780 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.4572308659553528, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 1790 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.4888782501220703, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 1800 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.4442083239555359, + "learning_rate": 0.0002, + "loss": 1.7168, + "step": 1810 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.4986329972743988, + "learning_rate": 0.0002, + "loss": 1.6375, + "step": 1820 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.47918054461479187, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 1830 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.42569679021835327, + "learning_rate": 0.0002, + "loss": 1.5969, + "step": 1840 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0002, + "loss": 1.5751, + "step": 1850 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.43605074286460876, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 1860 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.4189167618751526, + "learning_rate": 0.0002, + "loss": 1.6885, + "step": 1870 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.5860861539840698, + "learning_rate": 0.0002, + "loss": 1.6493, + "step": 1880 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.4568740427494049, + "learning_rate": 0.0002, + "loss": 1.6563, + "step": 1890 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.4672846496105194, + "learning_rate": 0.0002, + "loss": 1.6653, + "step": 1900 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.4280472993850708, + "learning_rate": 0.0002, + "loss": 1.6037, + "step": 1910 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.590728759765625, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 1920 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.4205126166343689, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1930 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.47869905829429626, + "learning_rate": 0.0002, + "loss": 1.5045, + "step": 1940 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.4607323408126831, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 1950 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.4762210547924042, + "learning_rate": 0.0002, + "loss": 1.644, + "step": 1960 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.46832647919654846, + "learning_rate": 0.0002, + "loss": 1.6316, + "step": 1970 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.4368574619293213, + "learning_rate": 0.0002, + "loss": 1.6591, + "step": 1980 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.5248273611068726, + "learning_rate": 0.0002, + "loss": 1.6359, + "step": 1990 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.46777117252349854, + "learning_rate": 0.0002, + "loss": 1.6879, + "step": 2000 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.5201858878135681, + "learning_rate": 0.0002, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.46777284145355225, + "learning_rate": 0.0002, + "loss": 1.6337, + "step": 2020 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.46736642718315125, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2030 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.4647925794124603, + "learning_rate": 0.0002, + "loss": 1.6356, + "step": 2040 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.4298803508281708, + "learning_rate": 0.0002, + "loss": 1.732, + "step": 2050 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.45485609769821167, + "learning_rate": 0.0002, + "loss": 1.6648, + "step": 2060 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.43687865138053894, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2070 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.4319164752960205, + "learning_rate": 0.0002, + "loss": 1.6904, + "step": 2080 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.47792428731918335, + "learning_rate": 0.0002, + "loss": 1.6531, + "step": 2090 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.5322234034538269, + "learning_rate": 0.0002, + "loss": 1.6417, + "step": 2100 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.47517943382263184, + "learning_rate": 0.0002, + "loss": 1.6634, + "step": 2110 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.45799025893211365, + "learning_rate": 0.0002, + "loss": 1.6329, + "step": 2120 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.45852357149124146, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2130 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.4617408514022827, + "learning_rate": 0.0002, + "loss": 1.61, + "step": 2140 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.44205963611602783, + "learning_rate": 0.0002, + "loss": 1.6445, + "step": 2150 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.47173425555229187, + "learning_rate": 0.0002, + "loss": 1.6231, + "step": 2160 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.46379899978637695, + "learning_rate": 0.0002, + "loss": 1.6425, + "step": 2170 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.4999759793281555, + "learning_rate": 0.0002, + "loss": 1.6403, + "step": 2180 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.4607947766780853, + "learning_rate": 0.0002, + "loss": 1.6741, + "step": 2190 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.4359836280345917, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 2200 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.5195549726486206, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 2210 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.4914056062698364, + "learning_rate": 0.0002, + "loss": 1.6348, + "step": 2220 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.4647377133369446, + "learning_rate": 0.0002, + "loss": 1.6594, + "step": 2230 + }, + { + "epoch": 3.0, + "eval_loss": 1.8368606567382812, + "eval_runtime": 90.5623, + "eval_samples_per_second": 5.687, + "eval_steps_per_second": 0.718, + "step": 2238 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.40689945220947266, + "learning_rate": 0.0002, + "loss": 1.5704, + "step": 2240 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.4699273705482483, + "learning_rate": 0.0002, + "loss": 1.5961, + "step": 2250 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.5531830787658691, + "learning_rate": 0.0002, + "loss": 1.5182, + "step": 2260 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.5441790223121643, + "learning_rate": 0.0002, + "loss": 1.4924, + "step": 2270 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.6145012974739075, + "learning_rate": 0.0002, + "loss": 1.4953, + "step": 2280 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.6997102499008179, + "learning_rate": 0.0002, + "loss": 1.4861, + "step": 2290 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.6082330942153931, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 2300 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.5294155478477478, + "learning_rate": 0.0002, + "loss": 1.5377, + "step": 2310 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.7200340032577515, + "learning_rate": 0.0002, + "loss": 1.5452, + "step": 2320 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.721092939376831, + "learning_rate": 0.0002, + "loss": 1.5296, + "step": 2330 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.5344305038452148, + "learning_rate": 0.0002, + "loss": 1.5307, + "step": 2340 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.5533145070075989, + "learning_rate": 0.0002, + "loss": 1.4347, + "step": 2350 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.5976856350898743, + "learning_rate": 0.0002, + "loss": 1.529, + "step": 2360 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.4974960386753082, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2370 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.6377840042114258, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 2380 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.5447293519973755, + "learning_rate": 0.0002, + "loss": 1.5322, + "step": 2390 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.49577030539512634, + "learning_rate": 0.0002, + "loss": 1.5127, + "step": 2400 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.5588275790214539, + "learning_rate": 0.0002, + "loss": 1.4768, + "step": 2410 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.6429149508476257, + "learning_rate": 0.0002, + "loss": 1.4755, + "step": 2420 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.5713154673576355, + "learning_rate": 0.0002, + "loss": 1.5596, + "step": 2430 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.6348955035209656, + "learning_rate": 0.0002, + "loss": 1.4763, + "step": 2440 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.5675528645515442, + "learning_rate": 0.0002, + "loss": 1.509, + "step": 2450 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.5570188164710999, + "learning_rate": 0.0002, + "loss": 1.5867, + "step": 2460 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.6029602289199829, + "learning_rate": 0.0002, + "loss": 1.554, + "step": 2470 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.523206353187561, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2480 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.5912408828735352, + "learning_rate": 0.0002, + "loss": 1.4854, + "step": 2490 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.5524865984916687, + "learning_rate": 0.0002, + "loss": 1.5097, + "step": 2500 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.60386061668396, + "learning_rate": 0.0002, + "loss": 1.5064, + "step": 2510 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.5838595628738403, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 2520 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.5400974154472351, + "learning_rate": 0.0002, + "loss": 1.4615, + "step": 2530 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.6150162220001221, + "learning_rate": 0.0002, + "loss": 1.5349, + "step": 2540 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.5279412269592285, + "learning_rate": 0.0002, + "loss": 1.5978, + "step": 2550 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.5974063873291016, + "learning_rate": 0.0002, + "loss": 1.5063, + "step": 2560 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.661573052406311, + "learning_rate": 0.0002, + "loss": 1.5825, + "step": 2570 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.577880322933197, + "learning_rate": 0.0002, + "loss": 1.5204, + "step": 2580 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.5532318949699402, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 2590 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.5764921307563782, + "learning_rate": 0.0002, + "loss": 1.4933, + "step": 2600 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.6145682334899902, + "learning_rate": 0.0002, + "loss": 1.4355, + "step": 2610 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.6561126112937927, + "learning_rate": 0.0002, + "loss": 1.4968, + "step": 2620 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.5673288106918335, + "learning_rate": 0.0002, + "loss": 1.5309, + "step": 2630 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.6215338706970215, + "learning_rate": 0.0002, + "loss": 1.5274, + "step": 2640 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.5512040853500366, + "learning_rate": 0.0002, + "loss": 1.5117, + "step": 2650 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.49503496289253235, + "learning_rate": 0.0002, + "loss": 1.5188, + "step": 2660 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.5714912414550781, + "learning_rate": 0.0002, + "loss": 1.524, + "step": 2670 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.6883154511451721, + "learning_rate": 0.0002, + "loss": 1.4651, + "step": 2680 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.5989556908607483, + "learning_rate": 0.0002, + "loss": 1.5174, + "step": 2690 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.630268394947052, + "learning_rate": 0.0002, + "loss": 1.5335, + "step": 2700 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.5819358229637146, + "learning_rate": 0.0002, + "loss": 1.4681, + "step": 2710 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.6102097034454346, + "learning_rate": 0.0002, + "loss": 1.5676, + "step": 2720 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.6858501434326172, + "learning_rate": 0.0002, + "loss": 1.5566, + "step": 2730 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.6328608393669128, + "learning_rate": 0.0002, + "loss": 1.5242, + "step": 2740 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.5366981029510498, + "learning_rate": 0.0002, + "loss": 1.5211, + "step": 2750 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.7048938274383545, + "learning_rate": 0.0002, + "loss": 1.5532, + "step": 2760 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.5371938347816467, + "learning_rate": 0.0002, + "loss": 1.5001, + "step": 2770 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.6142212152481079, + "learning_rate": 0.0002, + "loss": 1.557, + "step": 2780 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.6164522171020508, + "learning_rate": 0.0002, + "loss": 1.5191, + "step": 2790 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.7511836886405945, + "learning_rate": 0.0002, + "loss": 1.5071, + "step": 2800 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.6194717288017273, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 2810 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.676721453666687, + "learning_rate": 0.0002, + "loss": 1.5721, + "step": 2820 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.5646911263465881, + "learning_rate": 0.0002, + "loss": 1.502, + "step": 2830 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.5874826908111572, + "learning_rate": 0.0002, + "loss": 1.4871, + "step": 2840 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.6395232677459717, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2850 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.624563992023468, + "learning_rate": 0.0002, + "loss": 1.5088, + "step": 2860 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.59019935131073, + "learning_rate": 0.0002, + "loss": 1.479, + "step": 2870 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.6700479984283447, + "learning_rate": 0.0002, + "loss": 1.4693, + "step": 2880 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.6131282448768616, + "learning_rate": 0.0002, + "loss": 1.5032, + "step": 2890 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.6807777881622314, + "learning_rate": 0.0002, + "loss": 1.5446, + "step": 2900 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.5297217965126038, + "learning_rate": 0.0002, + "loss": 1.5618, + "step": 2910 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.5795540809631348, + "learning_rate": 0.0002, + "loss": 1.5046, + "step": 2920 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.5549747347831726, + "learning_rate": 0.0002, + "loss": 1.5155, + "step": 2930 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.5895092487335205, + "learning_rate": 0.0002, + "loss": 1.5932, + "step": 2940 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.590002715587616, + "learning_rate": 0.0002, + "loss": 1.5831, + "step": 2950 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.7847695350646973, + "learning_rate": 0.0002, + "loss": 1.592, + "step": 2960 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.5845848321914673, + "learning_rate": 0.0002, + "loss": 1.4892, + "step": 2970 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.5861571431159973, + "learning_rate": 0.0002, + "loss": 1.5094, + "step": 2980 + }, + { + "epoch": 4.0, + "eval_loss": 1.8821998834609985, + "eval_runtime": 90.8701, + "eval_samples_per_second": 5.667, + "eval_steps_per_second": 0.715, + "step": 2984 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.6209918260574341, + "learning_rate": 0.0002, + "loss": 1.4156, + "step": 2990 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 0.607226550579071, + "learning_rate": 0.0002, + "loss": 1.4244, + "step": 3000 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 0.6677961349487305, + "learning_rate": 0.0002, + "loss": 1.3652, + "step": 3010 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 0.9053248763084412, + "learning_rate": 0.0002, + "loss": 1.3815, + "step": 3020 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 0.6815084218978882, + "learning_rate": 0.0002, + "loss": 1.4346, + "step": 3030 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.6709407567977905, + "learning_rate": 0.0002, + "loss": 1.3, + "step": 3040 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 0.728184163570404, + "learning_rate": 0.0002, + "loss": 1.3406, + "step": 3050 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 0.817628800868988, + "learning_rate": 0.0002, + "loss": 1.3404, + "step": 3060 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 0.7384206056594849, + "learning_rate": 0.0002, + "loss": 1.3496, + "step": 3070 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 0.7380280494689941, + "learning_rate": 0.0002, + "loss": 1.3621, + "step": 3080 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 0.8197277188301086, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 3090 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.8971617817878723, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3100 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.7409387826919556, + "learning_rate": 0.0002, + "loss": 1.3564, + "step": 3110 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 0.6948909163475037, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3120 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.7619595527648926, + "learning_rate": 0.0002, + "loss": 1.3397, + "step": 3130 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 0.7657106518745422, + "learning_rate": 0.0002, + "loss": 1.3864, + "step": 3140 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 0.6919401288032532, + "learning_rate": 0.0002, + "loss": 1.4017, + "step": 3150 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.6991415023803711, + "learning_rate": 0.0002, + "loss": 1.3692, + "step": 3160 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 0.7349252700805664, + "learning_rate": 0.0002, + "loss": 1.3651, + "step": 3170 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 0.8838240504264832, + "learning_rate": 0.0002, + "loss": 1.367, + "step": 3180 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.7240107655525208, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3190 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.7338636517524719, + "learning_rate": 0.0002, + "loss": 1.3671, + "step": 3200 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 0.7891436815261841, + "learning_rate": 0.0002, + "loss": 1.448, + "step": 3210 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 0.7407845854759216, + "learning_rate": 0.0002, + "loss": 1.3291, + "step": 3220 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 0.7635948061943054, + "learning_rate": 0.0002, + "loss": 1.3899, + "step": 3230 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 0.7478461861610413, + "learning_rate": 0.0002, + "loss": 1.3384, + "step": 3240 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 0.7684298157691956, + "learning_rate": 0.0002, + "loss": 1.388, + "step": 3250 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 1.0287525653839111, + "learning_rate": 0.0002, + "loss": 1.4233, + "step": 3260 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 0.750616192817688, + "learning_rate": 0.0002, + "loss": 1.3542, + "step": 3270 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 0.7911648750305176, + "learning_rate": 0.0002, + "loss": 1.3158, + "step": 3280 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 0.9156750440597534, + "learning_rate": 0.0002, + "loss": 1.3896, + "step": 3290 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 1.0180249214172363, + "learning_rate": 0.0002, + "loss": 1.3887, + "step": 3300 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 1.0792218446731567, + "learning_rate": 0.0002, + "loss": 1.4143, + "step": 3310 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 0.8027488589286804, + "learning_rate": 0.0002, + "loss": 1.3314, + "step": 3320 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 0.8037815093994141, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3330 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 0.7907946705818176, + "learning_rate": 0.0002, + "loss": 1.4124, + "step": 3340 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 0.7206302881240845, + "learning_rate": 0.0002, + "loss": 1.443, + "step": 3350 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 0.7697674632072449, + "learning_rate": 0.0002, + "loss": 1.3822, + "step": 3360 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 0.7315130829811096, + "learning_rate": 0.0002, + "loss": 1.3923, + "step": 3370 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 0.7896273136138916, + "learning_rate": 0.0002, + "loss": 1.3598, + "step": 3380 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 0.7720345258712769, + "learning_rate": 0.0002, + "loss": 1.3947, + "step": 3390 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 0.8304631114006042, + "learning_rate": 0.0002, + "loss": 1.404, + "step": 3400 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 0.7408214211463928, + "learning_rate": 0.0002, + "loss": 1.3712, + "step": 3410 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 0.8100157976150513, + "learning_rate": 0.0002, + "loss": 1.3957, + "step": 3420 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 0.7829574942588806, + "learning_rate": 0.0002, + "loss": 1.47, + "step": 3430 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 0.9529728889465332, + "learning_rate": 0.0002, + "loss": 1.3684, + "step": 3440 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 1.0769460201263428, + "learning_rate": 0.0002, + "loss": 1.3984, + "step": 3450 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.8941947817802429, + "learning_rate": 0.0002, + "loss": 1.4063, + "step": 3460 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 0.7860096096992493, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 3470 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 0.8184044361114502, + "learning_rate": 0.0002, + "loss": 1.3782, + "step": 3480 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 0.7852717638015747, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 3490 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 0.750586986541748, + "learning_rate": 0.0002, + "loss": 1.4139, + "step": 3500 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 0.7966068983078003, + "learning_rate": 0.0002, + "loss": 1.3224, + "step": 3510 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 0.8387030959129333, + "learning_rate": 0.0002, + "loss": 1.4052, + "step": 3520 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 0.7373180389404297, + "learning_rate": 0.0002, + "loss": 1.4541, + "step": 3530 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 0.8415353894233704, + "learning_rate": 0.0002, + "loss": 1.4148, + "step": 3540 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 0.7155488133430481, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 3550 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 0.697658896446228, + "learning_rate": 0.0002, + "loss": 1.3454, + "step": 3560 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 0.8722999095916748, + "learning_rate": 0.0002, + "loss": 1.4002, + "step": 3570 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 0.8106381297111511, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 3580 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 0.9320500493049622, + "learning_rate": 0.0002, + "loss": 1.3525, + "step": 3590 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 0.7583016157150269, + "learning_rate": 0.0002, + "loss": 1.3675, + "step": 3600 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 0.790050208568573, + "learning_rate": 0.0002, + "loss": 1.3761, + "step": 3610 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 0.7481580972671509, + "learning_rate": 0.0002, + "loss": 1.4144, + "step": 3620 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.8709374666213989, + "learning_rate": 0.0002, + "loss": 1.4424, + "step": 3630 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 0.7266733050346375, + "learning_rate": 0.0002, + "loss": 1.3758, + "step": 3640 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 0.7669504880905151, + "learning_rate": 0.0002, + "loss": 1.4254, + "step": 3650 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 0.7855764627456665, + "learning_rate": 0.0002, + "loss": 1.3956, + "step": 3660 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 0.8145440816879272, + "learning_rate": 0.0002, + "loss": 1.4609, + "step": 3670 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 0.7487278580665588, + "learning_rate": 0.0002, + "loss": 1.4152, + "step": 3680 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 0.8390981554985046, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 3690 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 0.663752555847168, + "learning_rate": 0.0002, + "loss": 1.3504, + "step": 3700 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 0.7821969985961914, + "learning_rate": 0.0002, + "loss": 1.3453, + "step": 3710 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 0.9157266020774841, + "learning_rate": 0.0002, + "loss": 1.3936, + "step": 3720 + }, + { + "epoch": 5.0, + "grad_norm": 0.7683535814285278, + "learning_rate": 0.0002, + "loss": 1.3925, + "step": 3730 + }, + { + "epoch": 5.0, + "eval_loss": 1.9639414548873901, + "eval_runtime": 92.0173, + "eval_samples_per_second": 5.597, + "eval_steps_per_second": 0.706, + "step": 3730 + }, + { + "epoch": 5.013404825737266, + "grad_norm": 1.3000373840332031, + "learning_rate": 0.0002, + "loss": 1.1852, + "step": 3740 + }, + { + "epoch": 5.02680965147453, + "grad_norm": 0.8916982412338257, + "learning_rate": 0.0002, + "loss": 1.1922, + "step": 3750 + }, + { + "epoch": 5.040214477211796, + "grad_norm": 1.0365116596221924, + "learning_rate": 0.0002, + "loss": 1.2113, + "step": 3760 + }, + { + "epoch": 5.053619302949062, + "grad_norm": 0.999420166015625, + "learning_rate": 0.0002, + "loss": 1.2941, + "step": 3770 + }, + { + "epoch": 5.067024128686327, + "grad_norm": 1.093572974205017, + "learning_rate": 0.0002, + "loss": 1.24, + "step": 3780 + }, + { + "epoch": 5.080428954423592, + "grad_norm": 1.1137515306472778, + "learning_rate": 0.0002, + "loss": 1.2345, + "step": 3790 + }, + { + "epoch": 5.093833780160858, + "grad_norm": 1.0328283309936523, + "learning_rate": 0.0002, + "loss": 1.1646, + "step": 3800 + }, + { + "epoch": 5.107238605898123, + "grad_norm": 1.0444108247756958, + "learning_rate": 0.0002, + "loss": 1.1716, + "step": 3810 + }, + { + "epoch": 5.120643431635389, + "grad_norm": 0.858148992061615, + "learning_rate": 0.0002, + "loss": 1.2226, + "step": 3820 + }, + { + "epoch": 5.134048257372654, + "grad_norm": 0.94026780128479, + "learning_rate": 0.0002, + "loss": 1.1691, + "step": 3830 + }, + { + "epoch": 5.1474530831099194, + "grad_norm": 0.8987152576446533, + "learning_rate": 0.0002, + "loss": 1.1902, + "step": 3840 + }, + { + "epoch": 5.160857908847185, + "grad_norm": 0.922997236251831, + "learning_rate": 0.0002, + "loss": 1.1562, + "step": 3850 + }, + { + "epoch": 5.174262734584451, + "grad_norm": 0.9172422289848328, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 3860 + }, + { + "epoch": 5.1876675603217155, + "grad_norm": 1.02277672290802, + "learning_rate": 0.0002, + "loss": 1.1802, + "step": 3870 + }, + { + "epoch": 5.201072386058981, + "grad_norm": 1.093826413154602, + "learning_rate": 0.0002, + "loss": 1.2206, + "step": 3880 + }, + { + "epoch": 5.214477211796247, + "grad_norm": 0.9362447261810303, + "learning_rate": 0.0002, + "loss": 1.2578, + "step": 3890 + }, + { + "epoch": 5.227882037533512, + "grad_norm": 1.0564044713974, + "learning_rate": 0.0002, + "loss": 1.2335, + "step": 3900 + }, + { + "epoch": 5.241286863270777, + "grad_norm": 0.869575023651123, + "learning_rate": 0.0002, + "loss": 1.1936, + "step": 3910 + }, + { + "epoch": 5.254691689008043, + "grad_norm": 1.0383203029632568, + "learning_rate": 0.0002, + "loss": 1.2301, + "step": 3920 + }, + { + "epoch": 5.2680965147453085, + "grad_norm": 0.9146919846534729, + "learning_rate": 0.0002, + "loss": 1.2076, + "step": 3930 + }, + { + "epoch": 5.281501340482574, + "grad_norm": 0.9226430654525757, + "learning_rate": 0.0002, + "loss": 1.2804, + "step": 3940 + }, + { + "epoch": 5.294906166219839, + "grad_norm": 0.8703194260597229, + "learning_rate": 0.0002, + "loss": 1.2506, + "step": 3950 + }, + { + "epoch": 5.3083109919571045, + "grad_norm": 1.0588284730911255, + "learning_rate": 0.0002, + "loss": 1.2533, + "step": 3960 + }, + { + "epoch": 5.32171581769437, + "grad_norm": 1.1131688356399536, + "learning_rate": 0.0002, + "loss": 1.2405, + "step": 3970 + }, + { + "epoch": 5.335120643431635, + "grad_norm": 1.1073139905929565, + "learning_rate": 0.0002, + "loss": 1.1719, + "step": 3980 + }, + { + "epoch": 5.348525469168901, + "grad_norm": 0.9269049763679504, + "learning_rate": 0.0002, + "loss": 1.2375, + "step": 3990 + }, + { + "epoch": 5.361930294906166, + "grad_norm": 0.9802212715148926, + "learning_rate": 0.0002, + "loss": 1.2513, + "step": 4000 + }, + { + "epoch": 5.375335120643432, + "grad_norm": 0.9152148365974426, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 4010 + }, + { + "epoch": 5.388739946380697, + "grad_norm": 1.0395890474319458, + "learning_rate": 0.0002, + "loss": 1.2673, + "step": 4020 + }, + { + "epoch": 5.402144772117962, + "grad_norm": 1.0989106893539429, + "learning_rate": 0.0002, + "loss": 1.2228, + "step": 4030 + }, + { + "epoch": 5.415549597855228, + "grad_norm": 1.0305225849151611, + "learning_rate": 0.0002, + "loss": 1.2717, + "step": 4040 + }, + { + "epoch": 5.428954423592494, + "grad_norm": 0.8416915535926819, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 4050 + }, + { + "epoch": 5.442359249329758, + "grad_norm": 0.9120758175849915, + "learning_rate": 0.0002, + "loss": 1.2205, + "step": 4060 + }, + { + "epoch": 5.455764075067024, + "grad_norm": 1.197936773300171, + "learning_rate": 0.0002, + "loss": 1.2812, + "step": 4070 + }, + { + "epoch": 5.46916890080429, + "grad_norm": 1.0116125345230103, + "learning_rate": 0.0002, + "loss": 1.2346, + "step": 4080 + }, + { + "epoch": 5.482573726541555, + "grad_norm": 1.048995018005371, + "learning_rate": 0.0002, + "loss": 1.1746, + "step": 4090 + }, + { + "epoch": 5.49597855227882, + "grad_norm": 0.929185152053833, + "learning_rate": 0.0002, + "loss": 1.1858, + "step": 4100 + }, + { + "epoch": 5.509383378016086, + "grad_norm": 0.9064884781837463, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 4110 + }, + { + "epoch": 5.522788203753351, + "grad_norm": 1.2009892463684082, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 4120 + }, + { + "epoch": 5.536193029490617, + "grad_norm": 0.9054455161094666, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 4130 + }, + { + "epoch": 5.549597855227882, + "grad_norm": 0.9978497624397278, + "learning_rate": 0.0002, + "loss": 1.1624, + "step": 4140 + }, + { + "epoch": 5.563002680965147, + "grad_norm": 0.9779615998268127, + "learning_rate": 0.0002, + "loss": 1.2814, + "step": 4150 + }, + { + "epoch": 5.576407506702413, + "grad_norm": 1.0515185594558716, + "learning_rate": 0.0002, + "loss": 1.2361, + "step": 4160 + }, + { + "epoch": 5.589812332439678, + "grad_norm": 0.8618236184120178, + "learning_rate": 0.0002, + "loss": 1.2278, + "step": 4170 + }, + { + "epoch": 5.603217158176943, + "grad_norm": 0.9569384455680847, + "learning_rate": 0.0002, + "loss": 1.2853, + "step": 4180 + }, + { + "epoch": 5.616621983914209, + "grad_norm": 0.968923807144165, + "learning_rate": 0.0002, + "loss": 1.2824, + "step": 4190 + }, + { + "epoch": 5.630026809651475, + "grad_norm": 0.8759993314743042, + "learning_rate": 0.0002, + "loss": 1.3055, + "step": 4200 + }, + { + "epoch": 5.64343163538874, + "grad_norm": 0.9284833669662476, + "learning_rate": 0.0002, + "loss": 1.2912, + "step": 4210 + }, + { + "epoch": 5.656836461126005, + "grad_norm": 0.9293071031570435, + "learning_rate": 0.0002, + "loss": 1.2886, + "step": 4220 + }, + { + "epoch": 5.670241286863271, + "grad_norm": 0.9872161149978638, + "learning_rate": 0.0002, + "loss": 1.2704, + "step": 4230 + }, + { + "epoch": 5.683646112600536, + "grad_norm": 0.9545941948890686, + "learning_rate": 0.0002, + "loss": 1.2525, + "step": 4240 + }, + { + "epoch": 5.697050938337801, + "grad_norm": 1.0202341079711914, + "learning_rate": 0.0002, + "loss": 1.2639, + "step": 4250 + }, + { + "epoch": 5.710455764075067, + "grad_norm": 0.9821504950523376, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 4260 + }, + { + "epoch": 5.7238605898123325, + "grad_norm": 1.0581456422805786, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 4270 + }, + { + "epoch": 5.737265415549598, + "grad_norm": 0.9639395475387573, + "learning_rate": 0.0002, + "loss": 1.227, + "step": 4280 + }, + { + "epoch": 5.750670241286863, + "grad_norm": 2.205458164215088, + "learning_rate": 0.0002, + "loss": 1.2849, + "step": 4290 + }, + { + "epoch": 5.7640750670241285, + "grad_norm": 1.0294393301010132, + "learning_rate": 0.0002, + "loss": 1.2785, + "step": 4300 + }, + { + "epoch": 5.777479892761394, + "grad_norm": 1.0360256433486938, + "learning_rate": 0.0002, + "loss": 1.261, + "step": 4310 + }, + { + "epoch": 5.79088471849866, + "grad_norm": 0.9390154480934143, + "learning_rate": 0.0002, + "loss": 1.2891, + "step": 4320 + }, + { + "epoch": 5.804289544235925, + "grad_norm": 0.9048963189125061, + "learning_rate": 0.0002, + "loss": 1.248, + "step": 4330 + }, + { + "epoch": 5.81769436997319, + "grad_norm": 0.9310713410377502, + "learning_rate": 0.0002, + "loss": 1.2753, + "step": 4340 + }, + { + "epoch": 5.831099195710456, + "grad_norm": 1.038282871246338, + "learning_rate": 0.0002, + "loss": 1.2393, + "step": 4350 + }, + { + "epoch": 5.8445040214477215, + "grad_norm": 0.9194827079772949, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 4360 + }, + { + "epoch": 5.857908847184986, + "grad_norm": 0.9568411111831665, + "learning_rate": 0.0002, + "loss": 1.3049, + "step": 4370 + }, + { + "epoch": 5.871313672922252, + "grad_norm": 0.9088910818099976, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 4380 + }, + { + "epoch": 5.884718498659518, + "grad_norm": 1.0605647563934326, + "learning_rate": 0.0002, + "loss": 1.2497, + "step": 4390 + }, + { + "epoch": 5.898123324396783, + "grad_norm": 0.8016388416290283, + "learning_rate": 0.0002, + "loss": 1.2387, + "step": 4400 + }, + { + "epoch": 5.911528150134048, + "grad_norm": 1.0792853832244873, + "learning_rate": 0.0002, + "loss": 1.3046, + "step": 4410 + }, + { + "epoch": 5.924932975871314, + "grad_norm": 1.059403657913208, + "learning_rate": 0.0002, + "loss": 1.282, + "step": 4420 + }, + { + "epoch": 5.938337801608579, + "grad_norm": 0.87492436170578, + "learning_rate": 0.0002, + "loss": 1.2524, + "step": 4430 + }, + { + "epoch": 5.951742627345844, + "grad_norm": 1.0911097526550293, + "learning_rate": 0.0002, + "loss": 1.2373, + "step": 4440 + }, + { + "epoch": 5.96514745308311, + "grad_norm": 0.8860997557640076, + "learning_rate": 0.0002, + "loss": 1.3073, + "step": 4450 + }, + { + "epoch": 5.978552278820375, + "grad_norm": 0.9176826477050781, + "learning_rate": 0.0002, + "loss": 1.3273, + "step": 4460 + }, + { + "epoch": 5.991957104557641, + "grad_norm": 0.9018680453300476, + "learning_rate": 0.0002, + "loss": 1.2725, + "step": 4470 + }, + { + "epoch": 6.0, + "eval_loss": 2.0600433349609375, + "eval_runtime": 92.2728, + "eval_samples_per_second": 5.581, + "eval_steps_per_second": 0.704, + "step": 4476 + }, + { + "epoch": 6.005361930294906, + "grad_norm": 0.8612148761749268, + "learning_rate": 0.0002, + "loss": 1.2019, + "step": 4480 + }, + { + "epoch": 6.018766756032171, + "grad_norm": 1.170229434967041, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 4490 + }, + { + "epoch": 6.032171581769437, + "grad_norm": 1.1005233526229858, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 4500 + }, + { + "epoch": 6.045576407506703, + "grad_norm": 1.1763442754745483, + "learning_rate": 0.0002, + "loss": 1.0936, + "step": 4510 + }, + { + "epoch": 6.058981233243967, + "grad_norm": 1.0595353841781616, + "learning_rate": 0.0002, + "loss": 0.9865, + "step": 4520 + }, + { + "epoch": 6.072386058981233, + "grad_norm": 1.3554084300994873, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 4530 + }, + { + "epoch": 6.085790884718499, + "grad_norm": 1.238821268081665, + "learning_rate": 0.0002, + "loss": 1.0619, + "step": 4540 + }, + { + "epoch": 6.099195710455764, + "grad_norm": 1.0496071577072144, + "learning_rate": 0.0002, + "loss": 1.0951, + "step": 4550 + }, + { + "epoch": 6.112600536193029, + "grad_norm": 1.3410215377807617, + "learning_rate": 0.0002, + "loss": 1.1128, + "step": 4560 + }, + { + "epoch": 6.126005361930295, + "grad_norm": 1.2559033632278442, + "learning_rate": 0.0002, + "loss": 1.0824, + "step": 4570 + }, + { + "epoch": 6.13941018766756, + "grad_norm": 1.2556545734405518, + "learning_rate": 0.0002, + "loss": 1.0645, + "step": 4580 + }, + { + "epoch": 6.152815013404826, + "grad_norm": 1.050678014755249, + "learning_rate": 0.0002, + "loss": 1.1219, + "step": 4590 + }, + { + "epoch": 6.166219839142091, + "grad_norm": 1.566770076751709, + "learning_rate": 0.0002, + "loss": 1.0421, + "step": 4600 + }, + { + "epoch": 6.1796246648793565, + "grad_norm": 1.1482226848602295, + "learning_rate": 0.0002, + "loss": 1.0617, + "step": 4610 + }, + { + "epoch": 6.193029490616622, + "grad_norm": 1.2731150388717651, + "learning_rate": 0.0002, + "loss": 1.0477, + "step": 4620 + }, + { + "epoch": 6.206434316353888, + "grad_norm": 1.4135994911193848, + "learning_rate": 0.0002, + "loss": 1.0291, + "step": 4630 + }, + { + "epoch": 6.2198391420911525, + "grad_norm": 1.2925093173980713, + "learning_rate": 0.0002, + "loss": 1.0666, + "step": 4640 + }, + { + "epoch": 6.233243967828418, + "grad_norm": 1.1199861764907837, + "learning_rate": 0.0002, + "loss": 1.0657, + "step": 4650 + }, + { + "epoch": 6.246648793565684, + "grad_norm": 1.2010078430175781, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 4660 + }, + { + "epoch": 6.2600536193029495, + "grad_norm": 1.2655692100524902, + "learning_rate": 0.0002, + "loss": 1.1186, + "step": 4670 + }, + { + "epoch": 6.273458445040214, + "grad_norm": 1.0960880517959595, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 4680 + }, + { + "epoch": 6.28686327077748, + "grad_norm": 1.170759916305542, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 4690 + }, + { + "epoch": 6.3002680965147455, + "grad_norm": 1.1199755668640137, + "learning_rate": 0.0002, + "loss": 1.0852, + "step": 4700 + }, + { + "epoch": 6.31367292225201, + "grad_norm": 1.1477710008621216, + "learning_rate": 0.0002, + "loss": 1.0171, + "step": 4710 + }, + { + "epoch": 6.327077747989276, + "grad_norm": 1.0862090587615967, + "learning_rate": 0.0002, + "loss": 1.0411, + "step": 4720 + }, + { + "epoch": 6.340482573726542, + "grad_norm": 1.1428112983703613, + "learning_rate": 0.0002, + "loss": 1.0299, + "step": 4730 + }, + { + "epoch": 6.353887399463807, + "grad_norm": 1.155534029006958, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 4740 + }, + { + "epoch": 6.367292225201073, + "grad_norm": 1.2997788190841675, + "learning_rate": 0.0002, + "loss": 1.1134, + "step": 4750 + }, + { + "epoch": 6.380697050938338, + "grad_norm": 1.1087043285369873, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 4760 + }, + { + "epoch": 6.394101876675603, + "grad_norm": 1.3957210779190063, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 4770 + }, + { + "epoch": 6.407506702412869, + "grad_norm": 1.1346395015716553, + "learning_rate": 0.0002, + "loss": 1.0803, + "step": 4780 + }, + { + "epoch": 6.420911528150134, + "grad_norm": 1.3830486536026, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 4790 + }, + { + "epoch": 6.434316353887399, + "grad_norm": 1.1137559413909912, + "learning_rate": 0.0002, + "loss": 1.138, + "step": 4800 + }, + { + "epoch": 6.447721179624665, + "grad_norm": 1.151821494102478, + "learning_rate": 0.0002, + "loss": 1.0863, + "step": 4810 + }, + { + "epoch": 6.461126005361931, + "grad_norm": 1.122589111328125, + "learning_rate": 0.0002, + "loss": 1.0821, + "step": 4820 + }, + { + "epoch": 6.474530831099195, + "grad_norm": 1.2847239971160889, + "learning_rate": 0.0002, + "loss": 1.1308, + "step": 4830 + }, + { + "epoch": 6.487935656836461, + "grad_norm": 1.027617335319519, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 4840 + }, + { + "epoch": 6.501340482573727, + "grad_norm": 1.3375194072723389, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 4850 + }, + { + "epoch": 6.514745308310992, + "grad_norm": 1.1723220348358154, + "learning_rate": 0.0002, + "loss": 1.1055, + "step": 4860 + }, + { + "epoch": 6.528150134048257, + "grad_norm": 1.7034224271774292, + "learning_rate": 0.0002, + "loss": 1.129, + "step": 4870 + }, + { + "epoch": 6.541554959785523, + "grad_norm": 1.0840927362442017, + "learning_rate": 0.0002, + "loss": 1.0544, + "step": 4880 + }, + { + "epoch": 6.554959785522788, + "grad_norm": 1.3088481426239014, + "learning_rate": 0.0002, + "loss": 1.1194, + "step": 4890 + }, + { + "epoch": 6.568364611260054, + "grad_norm": 1.1394107341766357, + "learning_rate": 0.0002, + "loss": 1.1513, + "step": 4900 + }, + { + "epoch": 6.581769436997319, + "grad_norm": 1.0243184566497803, + "learning_rate": 0.0002, + "loss": 1.0796, + "step": 4910 + }, + { + "epoch": 6.595174262734584, + "grad_norm": 1.0814571380615234, + "learning_rate": 0.0002, + "loss": 1.2096, + "step": 4920 + }, + { + "epoch": 6.60857908847185, + "grad_norm": 1.1652323007583618, + "learning_rate": 0.0002, + "loss": 1.1279, + "step": 4930 + }, + { + "epoch": 6.621983914209116, + "grad_norm": 1.0203579664230347, + "learning_rate": 0.0002, + "loss": 1.186, + "step": 4940 + }, + { + "epoch": 6.6353887399463805, + "grad_norm": 1.3823212385177612, + "learning_rate": 0.0002, + "loss": 1.1243, + "step": 4950 + }, + { + "epoch": 6.648793565683646, + "grad_norm": 1.248955488204956, + "learning_rate": 0.0002, + "loss": 1.1464, + "step": 4960 + }, + { + "epoch": 6.662198391420912, + "grad_norm": 1.2215739488601685, + "learning_rate": 0.0002, + "loss": 1.1278, + "step": 4970 + }, + { + "epoch": 6.6756032171581765, + "grad_norm": 1.307869553565979, + "learning_rate": 0.0002, + "loss": 1.1109, + "step": 4980 + }, + { + "epoch": 6.689008042895442, + "grad_norm": 1.4434916973114014, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 4990 + }, + { + "epoch": 6.702412868632708, + "grad_norm": 1.1840227842330933, + "learning_rate": 0.0002, + "loss": 1.1068, + "step": 5000 + }, + { + "epoch": 6.7158176943699734, + "grad_norm": 1.1775435209274292, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 5010 + }, + { + "epoch": 6.729222520107239, + "grad_norm": 1.1639968156814575, + "learning_rate": 0.0002, + "loss": 1.114, + "step": 5020 + }, + { + "epoch": 6.742627345844504, + "grad_norm": 1.3774648904800415, + "learning_rate": 0.0002, + "loss": 1.1363, + "step": 5030 + }, + { + "epoch": 6.7560321715817695, + "grad_norm": 1.0328693389892578, + "learning_rate": 0.0002, + "loss": 1.095, + "step": 5040 + }, + { + "epoch": 6.769436997319035, + "grad_norm": 1.0495599508285522, + "learning_rate": 0.0002, + "loss": 1.1371, + "step": 5050 + }, + { + "epoch": 6.7828418230563, + "grad_norm": 1.3220133781433105, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 5060 + }, + { + "epoch": 6.7962466487935655, + "grad_norm": 1.3658279180526733, + "learning_rate": 0.0002, + "loss": 1.13, + "step": 5070 + }, + { + "epoch": 6.809651474530831, + "grad_norm": 1.3788504600524902, + "learning_rate": 0.0002, + "loss": 1.0755, + "step": 5080 + }, + { + "epoch": 6.823056300268097, + "grad_norm": 1.2342770099639893, + "learning_rate": 0.0002, + "loss": 1.1331, + "step": 5090 + }, + { + "epoch": 6.836461126005362, + "grad_norm": 1.3752578496932983, + "learning_rate": 0.0002, + "loss": 1.1761, + "step": 5100 + }, + { + "epoch": 6.849865951742627, + "grad_norm": 1.0902243852615356, + "learning_rate": 0.0002, + "loss": 1.078, + "step": 5110 + }, + { + "epoch": 6.863270777479893, + "grad_norm": 1.2125890254974365, + "learning_rate": 0.0002, + "loss": 1.1613, + "step": 5120 + }, + { + "epoch": 6.8766756032171585, + "grad_norm": 1.2979270219802856, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 5130 + }, + { + "epoch": 6.890080428954423, + "grad_norm": 1.2894749641418457, + "learning_rate": 0.0002, + "loss": 1.1207, + "step": 5140 + }, + { + "epoch": 6.903485254691689, + "grad_norm": 1.4804800748825073, + "learning_rate": 0.0002, + "loss": 1.1143, + "step": 5150 + }, + { + "epoch": 6.916890080428955, + "grad_norm": 1.1119170188903809, + "learning_rate": 0.0002, + "loss": 1.1245, + "step": 5160 + }, + { + "epoch": 6.930294906166219, + "grad_norm": 1.4991406202316284, + "learning_rate": 0.0002, + "loss": 1.1135, + "step": 5170 + }, + { + "epoch": 6.943699731903485, + "grad_norm": 1.2187672853469849, + "learning_rate": 0.0002, + "loss": 1.1025, + "step": 5180 + }, + { + "epoch": 6.957104557640751, + "grad_norm": 1.2419520616531372, + "learning_rate": 0.0002, + "loss": 1.1991, + "step": 5190 + }, + { + "epoch": 6.970509383378016, + "grad_norm": 1.359859585762024, + "learning_rate": 0.0002, + "loss": 1.1231, + "step": 5200 + }, + { + "epoch": 6.983914209115282, + "grad_norm": 1.3679486513137817, + "learning_rate": 0.0002, + "loss": 1.0882, + "step": 5210 + }, + { + "epoch": 6.997319034852547, + "grad_norm": 1.2109483480453491, + "learning_rate": 0.0002, + "loss": 1.1856, + "step": 5220 + }, + { + "epoch": 7.0, + "eval_loss": 2.194319725036621, + "eval_runtime": 93.0187, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 0.699, + "step": 5222 + }, + { + "epoch": 7.010723860589812, + "grad_norm": 1.1413990259170532, + "learning_rate": 0.0002, + "loss": 0.9569, + "step": 5230 + }, + { + "epoch": 7.024128686327078, + "grad_norm": 1.228061556816101, + "learning_rate": 0.0002, + "loss": 0.8378, + "step": 5240 + }, + { + "epoch": 7.037533512064343, + "grad_norm": 1.4723389148712158, + "learning_rate": 0.0002, + "loss": 0.945, + "step": 5250 + }, + { + "epoch": 7.050938337801608, + "grad_norm": 1.6016414165496826, + "learning_rate": 0.0002, + "loss": 0.9419, + "step": 5260 + }, + { + "epoch": 7.064343163538874, + "grad_norm": 1.173973798751831, + "learning_rate": 0.0002, + "loss": 0.8133, + "step": 5270 + }, + { + "epoch": 7.07774798927614, + "grad_norm": 1.7001465559005737, + "learning_rate": 0.0002, + "loss": 0.9426, + "step": 5280 + }, + { + "epoch": 7.091152815013404, + "grad_norm": 1.5025922060012817, + "learning_rate": 0.0002, + "loss": 0.9189, + "step": 5290 + }, + { + "epoch": 7.10455764075067, + "grad_norm": 1.3865472078323364, + "learning_rate": 0.0002, + "loss": 0.9106, + "step": 5300 + }, + { + "epoch": 7.117962466487936, + "grad_norm": 1.4111610651016235, + "learning_rate": 0.0002, + "loss": 0.9039, + "step": 5310 + }, + { + "epoch": 7.131367292225201, + "grad_norm": 1.3427162170410156, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 5320 + }, + { + "epoch": 7.144772117962466, + "grad_norm": 1.592889428138733, + "learning_rate": 0.0002, + "loss": 0.9665, + "step": 5330 + }, + { + "epoch": 7.158176943699732, + "grad_norm": 1.2716485261917114, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 5340 + }, + { + "epoch": 7.171581769436997, + "grad_norm": 1.3858015537261963, + "learning_rate": 0.0002, + "loss": 0.932, + "step": 5350 + }, + { + "epoch": 7.184986595174263, + "grad_norm": 1.4250117540359497, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 5360 + }, + { + "epoch": 7.198391420911528, + "grad_norm": 1.5094358921051025, + "learning_rate": 0.0002, + "loss": 0.908, + "step": 5370 + }, + { + "epoch": 7.2117962466487935, + "grad_norm": 1.299795150756836, + "learning_rate": 0.0002, + "loss": 0.9656, + "step": 5380 + }, + { + "epoch": 7.225201072386059, + "grad_norm": 1.4491885900497437, + "learning_rate": 0.0002, + "loss": 0.9416, + "step": 5390 + }, + { + "epoch": 7.238605898123325, + "grad_norm": 1.4907571077346802, + "learning_rate": 0.0002, + "loss": 0.9136, + "step": 5400 + }, + { + "epoch": 7.2520107238605895, + "grad_norm": 1.3086504936218262, + "learning_rate": 0.0002, + "loss": 0.8973, + "step": 5410 + }, + { + "epoch": 7.265415549597855, + "grad_norm": 1.2242939472198486, + "learning_rate": 0.0002, + "loss": 0.984, + "step": 5420 + }, + { + "epoch": 7.278820375335121, + "grad_norm": 1.4723531007766724, + "learning_rate": 0.0002, + "loss": 0.9271, + "step": 5430 + }, + { + "epoch": 7.292225201072386, + "grad_norm": 1.3514219522476196, + "learning_rate": 0.0002, + "loss": 0.9531, + "step": 5440 + }, + { + "epoch": 7.305630026809651, + "grad_norm": 1.484549641609192, + "learning_rate": 0.0002, + "loss": 1.0067, + "step": 5450 + }, + { + "epoch": 7.319034852546917, + "grad_norm": 1.4641015529632568, + "learning_rate": 0.0002, + "loss": 0.9408, + "step": 5460 + }, + { + "epoch": 7.3324396782841825, + "grad_norm": 1.4476960897445679, + "learning_rate": 0.0002, + "loss": 0.946, + "step": 5470 + }, + { + "epoch": 7.345844504021448, + "grad_norm": 1.5155150890350342, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 5480 + }, + { + "epoch": 7.359249329758713, + "grad_norm": 1.4297741651535034, + "learning_rate": 0.0002, + "loss": 0.9945, + "step": 5490 + }, + { + "epoch": 7.372654155495979, + "grad_norm": 1.5957597494125366, + "learning_rate": 0.0002, + "loss": 0.9897, + "step": 5500 + }, + { + "epoch": 7.386058981233244, + "grad_norm": 1.4234981536865234, + "learning_rate": 0.0002, + "loss": 0.9501, + "step": 5510 + }, + { + "epoch": 7.399463806970509, + "grad_norm": 1.4279195070266724, + "learning_rate": 0.0002, + "loss": 0.9248, + "step": 5520 + }, + { + "epoch": 7.412868632707775, + "grad_norm": 1.2789702415466309, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 5530 + }, + { + "epoch": 7.42627345844504, + "grad_norm": 1.3967640399932861, + "learning_rate": 0.0002, + "loss": 0.9614, + "step": 5540 + }, + { + "epoch": 7.439678284182306, + "grad_norm": 1.4384145736694336, + "learning_rate": 0.0002, + "loss": 0.9622, + "step": 5550 + }, + { + "epoch": 7.453083109919571, + "grad_norm": 1.2486642599105835, + "learning_rate": 0.0002, + "loss": 0.8888, + "step": 5560 + }, + { + "epoch": 7.466487935656836, + "grad_norm": 1.433598279953003, + "learning_rate": 0.0002, + "loss": 0.9768, + "step": 5570 + }, + { + "epoch": 7.479892761394102, + "grad_norm": 1.2411381006240845, + "learning_rate": 0.0002, + "loss": 0.9954, + "step": 5580 + }, + { + "epoch": 7.493297587131368, + "grad_norm": 1.5211423635482788, + "learning_rate": 0.0002, + "loss": 1.0025, + "step": 5590 + }, + { + "epoch": 7.506702412868632, + "grad_norm": 1.916807770729065, + "learning_rate": 0.0002, + "loss": 0.996, + "step": 5600 + }, + { + "epoch": 7.520107238605898, + "grad_norm": 1.1726218461990356, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 5610 + }, + { + "epoch": 7.533512064343164, + "grad_norm": 1.4437224864959717, + "learning_rate": 0.0002, + "loss": 0.9693, + "step": 5620 + }, + { + "epoch": 7.546916890080429, + "grad_norm": 1.3450417518615723, + "learning_rate": 0.0002, + "loss": 0.9574, + "step": 5630 + }, + { + "epoch": 7.560321715817694, + "grad_norm": 1.369955062866211, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 5640 + }, + { + "epoch": 7.57372654155496, + "grad_norm": 1.323500394821167, + "learning_rate": 0.0002, + "loss": 0.985, + "step": 5650 + }, + { + "epoch": 7.587131367292225, + "grad_norm": 1.4024254083633423, + "learning_rate": 0.0002, + "loss": 0.9351, + "step": 5660 + }, + { + "epoch": 7.600536193029491, + "grad_norm": 1.5177226066589355, + "learning_rate": 0.0002, + "loss": 0.9277, + "step": 5670 + }, + { + "epoch": 7.613941018766756, + "grad_norm": 1.3379560708999634, + "learning_rate": 0.0002, + "loss": 1.0089, + "step": 5680 + }, + { + "epoch": 7.627345844504021, + "grad_norm": 1.3165442943572998, + "learning_rate": 0.0002, + "loss": 0.9919, + "step": 5690 + }, + { + "epoch": 7.640750670241287, + "grad_norm": 1.4175701141357422, + "learning_rate": 0.0002, + "loss": 1.024, + "step": 5700 + }, + { + "epoch": 7.654155495978552, + "grad_norm": 1.531698226928711, + "learning_rate": 0.0002, + "loss": 0.9237, + "step": 5710 + }, + { + "epoch": 7.6675603217158175, + "grad_norm": 1.3139971494674683, + "learning_rate": 0.0002, + "loss": 1.0119, + "step": 5720 + }, + { + "epoch": 7.680965147453083, + "grad_norm": 1.4163814783096313, + "learning_rate": 0.0002, + "loss": 0.9301, + "step": 5730 + }, + { + "epoch": 7.694369973190349, + "grad_norm": 1.4500303268432617, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 5740 + }, + { + "epoch": 7.707774798927614, + "grad_norm": 1.2513974905014038, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 5750 + }, + { + "epoch": 7.721179624664879, + "grad_norm": 1.6025257110595703, + "learning_rate": 0.0002, + "loss": 0.9432, + "step": 5760 + }, + { + "epoch": 7.734584450402145, + "grad_norm": 1.4038569927215576, + "learning_rate": 0.0002, + "loss": 0.9981, + "step": 5770 + }, + { + "epoch": 7.7479892761394105, + "grad_norm": 1.464080572128296, + "learning_rate": 0.0002, + "loss": 0.9435, + "step": 5780 + }, + { + "epoch": 7.761394101876675, + "grad_norm": 1.51055908203125, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 5790 + }, + { + "epoch": 7.774798927613941, + "grad_norm": 1.4638031721115112, + "learning_rate": 0.0002, + "loss": 0.9887, + "step": 5800 + }, + { + "epoch": 7.7882037533512065, + "grad_norm": 1.274057388305664, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 5810 + }, + { + "epoch": 7.801608579088472, + "grad_norm": 1.4633456468582153, + "learning_rate": 0.0002, + "loss": 0.9756, + "step": 5820 + }, + { + "epoch": 7.815013404825737, + "grad_norm": 1.3144497871398926, + "learning_rate": 0.0002, + "loss": 1.0536, + "step": 5830 + }, + { + "epoch": 7.828418230563003, + "grad_norm": 1.496511459350586, + "learning_rate": 0.0002, + "loss": 1.0058, + "step": 5840 + }, + { + "epoch": 7.841823056300268, + "grad_norm": 1.603127360343933, + "learning_rate": 0.0002, + "loss": 1.0064, + "step": 5850 + }, + { + "epoch": 7.855227882037534, + "grad_norm": 1.376160979270935, + "learning_rate": 0.0002, + "loss": 1.0116, + "step": 5860 + }, + { + "epoch": 7.868632707774799, + "grad_norm": 1.9300047159194946, + "learning_rate": 0.0002, + "loss": 1.0103, + "step": 5870 + }, + { + "epoch": 7.882037533512064, + "grad_norm": 1.5328046083450317, + "learning_rate": 0.0002, + "loss": 1.044, + "step": 5880 + }, + { + "epoch": 7.89544235924933, + "grad_norm": 1.4844473600387573, + "learning_rate": 0.0002, + "loss": 1.022, + "step": 5890 + }, + { + "epoch": 7.908847184986596, + "grad_norm": 1.3647412061691284, + "learning_rate": 0.0002, + "loss": 1.0594, + "step": 5900 + }, + { + "epoch": 7.92225201072386, + "grad_norm": 1.4157295227050781, + "learning_rate": 0.0002, + "loss": 0.9822, + "step": 5910 + }, + { + "epoch": 7.935656836461126, + "grad_norm": 1.4677143096923828, + "learning_rate": 0.0002, + "loss": 0.9722, + "step": 5920 + }, + { + "epoch": 7.949061662198392, + "grad_norm": 1.322703242301941, + "learning_rate": 0.0002, + "loss": 0.9871, + "step": 5930 + }, + { + "epoch": 7.962466487935657, + "grad_norm": 1.1980623006820679, + "learning_rate": 0.0002, + "loss": 1.0684, + "step": 5940 + }, + { + "epoch": 7.975871313672922, + "grad_norm": 1.3701993227005005, + "learning_rate": 0.0002, + "loss": 0.9723, + "step": 5950 + }, + { + "epoch": 7.989276139410188, + "grad_norm": 1.4934145212173462, + "learning_rate": 0.0002, + "loss": 1.0442, + "step": 5960 + }, + { + "epoch": 8.0, + "eval_loss": 2.3211426734924316, + "eval_runtime": 92.2399, + "eval_samples_per_second": 5.583, + "eval_steps_per_second": 0.705, + "step": 5968 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.76185677106774e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-5968/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/README.md b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d63cb87eaccf2d81de3cdcfa11d2e99c440c0ea0 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a44fb8010b7504af70a6c5ae37a91ea5f51437ac --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95936535a1b252581f2b1e5886ff8d2ccbd864bf4e3475341cef91fedaa58a29 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7661608f99ca4cf628f8e912ca3dc10ef30e45bb --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb65f7c65425432aabada698aaaf8525c59ed1eb30ae67536dfe0acd2cdf7f2 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..038f814386564003ee94c202f343c0806e35be92 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ebdeae06c5250472d01acea4d98e61d73e7e861d4bb758a8a1b0edeff7d6754 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..edfa27a60777302c74400a7c3d2dc7655a18476d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:019f65dab10c8c0bf1492ed27750bfb79d0fc71e6b461141379d2f4f9c78db3d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..836a1878f05dbfef1db1159b75eacef30da5dcb3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/trainer_state.json @@ -0,0 +1,559 @@ +{ + "best_metric": 1.8168668746948242, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 746, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013404825737265416, + "grad_norm": 0.5006060004234314, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 10 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 0.895697832107544, + "learning_rate": 0.0002, + "loss": 2.2758, + "step": 20 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 0.4904654324054718, + "learning_rate": 0.0002, + "loss": 2.1106, + "step": 30 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 0.5587937831878662, + "learning_rate": 0.0002, + "loss": 1.9964, + "step": 40 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 0.46309754252433777, + "learning_rate": 0.0002, + "loss": 1.9997, + "step": 50 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 0.46663302183151245, + "learning_rate": 0.0002, + "loss": 1.9512, + "step": 60 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 0.6435502171516418, + "learning_rate": 0.0002, + "loss": 1.845, + "step": 70 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.46288377046585083, + "learning_rate": 0.0002, + "loss": 1.8528, + "step": 80 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.5226837396621704, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 90 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 1.190576195716858, + "learning_rate": 0.0002, + "loss": 1.8706, + "step": 100 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.4229426980018616, + "learning_rate": 0.0002, + "loss": 1.8465, + "step": 110 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.7448789477348328, + "learning_rate": 0.0002, + "loss": 1.8933, + "step": 120 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.3955472409725189, + "learning_rate": 0.0002, + "loss": 1.8377, + "step": 130 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.4333747327327728, + "learning_rate": 0.0002, + "loss": 1.8731, + "step": 140 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.4262531101703644, + "learning_rate": 0.0002, + "loss": 1.9102, + "step": 150 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.44875991344451904, + "learning_rate": 0.0002, + "loss": 1.8525, + "step": 160 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.39748692512512207, + "learning_rate": 0.0002, + "loss": 1.8104, + "step": 170 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.3995216488838196, + "learning_rate": 0.0002, + "loss": 1.8956, + "step": 180 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.4942905902862549, + "learning_rate": 0.0002, + "loss": 1.8166, + "step": 190 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.5456372499465942, + "learning_rate": 0.0002, + "loss": 1.8784, + "step": 200 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.42792096734046936, + "learning_rate": 0.0002, + "loss": 1.8204, + "step": 210 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.5114870667457581, + "learning_rate": 0.0002, + "loss": 1.8034, + "step": 220 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.41311749815940857, + "learning_rate": 0.0002, + "loss": 1.7965, + "step": 230 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.39651045203208923, + "learning_rate": 0.0002, + "loss": 1.8193, + "step": 240 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.3648274540901184, + "learning_rate": 0.0002, + "loss": 1.8806, + "step": 250 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.3815963566303253, + "learning_rate": 0.0002, + "loss": 1.7645, + "step": 260 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.4006984531879425, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 270 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.4043481647968292, + "learning_rate": 0.0002, + "loss": 1.8459, + "step": 280 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.37889420986175537, + "learning_rate": 0.0002, + "loss": 1.8551, + "step": 290 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.34378889203071594, + "learning_rate": 0.0002, + "loss": 1.8094, + "step": 300 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.3695462644100189, + "learning_rate": 0.0002, + "loss": 1.7489, + "step": 310 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.3820156753063202, + "learning_rate": 0.0002, + "loss": 1.7838, + "step": 320 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.4782438576221466, + "learning_rate": 0.0002, + "loss": 1.8432, + "step": 330 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.34293901920318604, + "learning_rate": 0.0002, + "loss": 1.8114, + "step": 340 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.34477704763412476, + "learning_rate": 0.0002, + "loss": 1.8255, + "step": 350 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.372482031583786, + "learning_rate": 0.0002, + "loss": 1.7518, + "step": 360 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.37152206897735596, + "learning_rate": 0.0002, + "loss": 1.7949, + "step": 370 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.3464239537715912, + "learning_rate": 0.0002, + "loss": 1.8622, + "step": 380 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.3936820328235626, + "learning_rate": 0.0002, + "loss": 1.7986, + "step": 390 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.4001905620098114, + "learning_rate": 0.0002, + "loss": 1.8422, + "step": 400 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.3600618243217468, + "learning_rate": 0.0002, + "loss": 1.889, + "step": 410 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.3735682964324951, + "learning_rate": 0.0002, + "loss": 1.7667, + "step": 420 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.34881851077079773, + "learning_rate": 0.0002, + "loss": 1.8039, + "step": 430 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.3512067496776581, + "learning_rate": 0.0002, + "loss": 1.8438, + "step": 440 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.42287155985832214, + "learning_rate": 0.0002, + "loss": 1.8021, + "step": 450 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.34132200479507446, + "learning_rate": 0.0002, + "loss": 1.8818, + "step": 460 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.345334529876709, + "learning_rate": 0.0002, + "loss": 1.7515, + "step": 470 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.363789826631546, + "learning_rate": 0.0002, + "loss": 1.8632, + "step": 480 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.33300429582595825, + "learning_rate": 0.0002, + "loss": 1.7783, + "step": 490 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.4159756600856781, + "learning_rate": 0.0002, + "loss": 1.8464, + "step": 500 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.3246348798274994, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 510 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.3838692307472229, + "learning_rate": 0.0002, + "loss": 1.8568, + "step": 520 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.3381868898868561, + "learning_rate": 0.0002, + "loss": 1.8308, + "step": 530 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.34136253595352173, + "learning_rate": 0.0002, + "loss": 1.8174, + "step": 540 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.3476671576499939, + "learning_rate": 0.0002, + "loss": 1.7902, + "step": 550 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.35285887122154236, + "learning_rate": 0.0002, + "loss": 1.792, + "step": 560 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.3596920371055603, + "learning_rate": 0.0002, + "loss": 1.8588, + "step": 570 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.32715895771980286, + "learning_rate": 0.0002, + "loss": 1.8762, + "step": 580 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.34543490409851074, + "learning_rate": 0.0002, + "loss": 1.7703, + "step": 590 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.37439998984336853, + "learning_rate": 0.0002, + "loss": 1.747, + "step": 600 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.3491382300853729, + "learning_rate": 0.0002, + "loss": 1.8243, + "step": 610 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.34014254808425903, + "learning_rate": 0.0002, + "loss": 1.8925, + "step": 620 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.3297452926635742, + "learning_rate": 0.0002, + "loss": 1.7386, + "step": 630 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.3458525538444519, + "learning_rate": 0.0002, + "loss": 1.7946, + "step": 640 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.3545733392238617, + "learning_rate": 0.0002, + "loss": 1.7439, + "step": 650 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.3864935040473938, + "learning_rate": 0.0002, + "loss": 1.7753, + "step": 660 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.35447531938552856, + "learning_rate": 0.0002, + "loss": 1.9012, + "step": 670 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.32028648257255554, + "learning_rate": 0.0002, + "loss": 1.8019, + "step": 680 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.36557647585868835, + "learning_rate": 0.0002, + "loss": 1.7813, + "step": 690 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.3581075072288513, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 700 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.3576897978782654, + "learning_rate": 0.0002, + "loss": 1.7897, + "step": 710 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.33551549911499023, + "learning_rate": 0.0002, + "loss": 1.7086, + "step": 720 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.39297860860824585, + "learning_rate": 0.0002, + "loss": 1.6907, + "step": 730 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.3467773199081421, + "learning_rate": 0.0002, + "loss": 1.7941, + "step": 740 + }, + { + "epoch": 1.0, + "eval_loss": 1.8168668746948242, + "eval_runtime": 90.6336, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.717, + "step": 746 + } + ], + "logging_steps": 10, + "max_steps": 5968, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.452320963834675e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1e6edc382a8cf8ffbc8d6b6a971b2c83ddfa661 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dfe2b0102b1ecd4dadeb818e314fee7d5fb2a15887cb362c36bc44960b3b0 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..45e2550d04c47f6af3b042a926f3b56075df716e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 746, "epoch_duration": 1998.8285369873047, "total_accumulated_duration": 1998.8285369873047, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}]} +{"epoch": 2.0, "step": 1492, "epoch_duration": 2072.8146653175354, "total_accumulated_duration": 4071.64320230484, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-746", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}]} +{"epoch": 3.0, "step": 2238, "epoch_duration": 2008.9420101642609, "total_accumulated_duration": 6080.585212469101, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}]} +{"epoch": 4.0, "step": 2984, "epoch_duration": 2080.9980370998383, "total_accumulated_duration": 8161.583249568939, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}, {"eval_loss": 1.8368606567382812, "eval_runtime": 90.5623, "eval_samples_per_second": 5.687, "eval_steps_per_second": 0.718, "epoch": 3.0, "step": 2238}, {"loss": 1.5704, "grad_norm": 0.40689945220947266, "learning_rate": 0.0002, "epoch": 3.002680965147453, "step": 2240}, {"loss": 1.5961, "grad_norm": 0.4699273705482483, "learning_rate": 0.0002, "epoch": 3.0160857908847185, "step": 2250}, {"loss": 1.5182, "grad_norm": 0.5531830787658691, "learning_rate": 0.0002, "epoch": 3.0294906166219837, "step": 2260}, {"loss": 1.4924, "grad_norm": 0.5441790223121643, "learning_rate": 0.0002, "epoch": 3.0428954423592494, "step": 2270}, {"loss": 1.4953, "grad_norm": 0.6145012974739075, "learning_rate": 0.0002, "epoch": 3.0563002680965146, "step": 2280}, {"loss": 1.4861, "grad_norm": 0.6997102499008179, "learning_rate": 0.0002, "epoch": 3.06970509383378, "step": 2290}, {"loss": 1.5853, "grad_norm": 0.6082330942153931, "learning_rate": 0.0002, "epoch": 3.0831099195710454, "step": 2300}, {"loss": 1.5377, "grad_norm": 0.5294155478477478, "learning_rate": 0.0002, "epoch": 3.096514745308311, "step": 2310}, {"loss": 1.5452, "grad_norm": 0.7200340032577515, "learning_rate": 0.0002, "epoch": 3.1099195710455763, "step": 2320}, {"loss": 1.5296, "grad_norm": 0.721092939376831, "learning_rate": 0.0002, "epoch": 3.123324396782842, "step": 2330}, {"loss": 1.5307, "grad_norm": 0.5344305038452148, "learning_rate": 0.0002, "epoch": 3.136729222520107, "step": 2340}, {"loss": 1.4347, "grad_norm": 0.5533145070075989, "learning_rate": 0.0002, "epoch": 3.1501340482573728, "step": 2350}, {"loss": 1.529, "grad_norm": 0.5976856350898743, "learning_rate": 0.0002, "epoch": 3.163538873994638, "step": 2360}, {"loss": 1.6044, "grad_norm": 0.4974960386753082, "learning_rate": 0.0002, "epoch": 3.1769436997319036, "step": 2370}, {"loss": 1.5554, "grad_norm": 0.6377840042114258, "learning_rate": 0.0002, "epoch": 3.190348525469169, "step": 2380}, {"loss": 1.5322, "grad_norm": 0.5447293519973755, "learning_rate": 0.0002, "epoch": 3.2037533512064345, "step": 2390}, {"loss": 1.5127, "grad_norm": 0.49577030539512634, "learning_rate": 0.0002, "epoch": 3.2171581769436997, "step": 2400}, {"loss": 1.4768, "grad_norm": 0.5588275790214539, "learning_rate": 0.0002, "epoch": 3.2305630026809653, "step": 2410}, {"loss": 1.4755, "grad_norm": 0.6429149508476257, "learning_rate": 0.0002, "epoch": 3.2439678284182305, "step": 2420}, {"loss": 1.5596, "grad_norm": 0.5713154673576355, "learning_rate": 0.0002, "epoch": 3.257372654155496, "step": 2430}, {"loss": 1.4763, "grad_norm": 0.6348955035209656, "learning_rate": 0.0002, "epoch": 3.2707774798927614, "step": 2440}, {"loss": 1.509, "grad_norm": 0.5675528645515442, "learning_rate": 0.0002, "epoch": 3.284182305630027, "step": 2450}, {"loss": 1.5867, "grad_norm": 0.5570188164710999, "learning_rate": 0.0002, "epoch": 3.297587131367292, "step": 2460}, {"loss": 1.554, "grad_norm": 0.6029602289199829, "learning_rate": 0.0002, "epoch": 3.310991957104558, "step": 2470}, {"loss": 1.5094, "grad_norm": 0.523206353187561, "learning_rate": 0.0002, "epoch": 3.324396782841823, "step": 2480}, {"loss": 1.4854, "grad_norm": 0.5912408828735352, "learning_rate": 0.0002, "epoch": 3.3378016085790883, "step": 2490}, {"loss": 1.5097, "grad_norm": 0.5524865984916687, "learning_rate": 0.0002, "epoch": 3.351206434316354, "step": 2500}, {"loss": 1.5064, "grad_norm": 0.60386061668396, "learning_rate": 0.0002, "epoch": 3.3646112600536195, "step": 2510}, {"loss": 1.564, "grad_norm": 0.5838595628738403, "learning_rate": 0.0002, "epoch": 3.3780160857908847, "step": 2520}, {"loss": 1.4615, "grad_norm": 0.5400974154472351, "learning_rate": 0.0002, "epoch": 3.39142091152815, "step": 2530}, {"loss": 1.5349, "grad_norm": 0.6150162220001221, "learning_rate": 0.0002, "epoch": 3.4048257372654156, "step": 2540}, {"loss": 1.5978, "grad_norm": 0.5279412269592285, "learning_rate": 0.0002, "epoch": 3.418230563002681, "step": 2550}, {"loss": 1.5063, "grad_norm": 0.5974063873291016, "learning_rate": 0.0002, "epoch": 3.4316353887399464, "step": 2560}, {"loss": 1.5825, "grad_norm": 0.661573052406311, "learning_rate": 0.0002, "epoch": 3.4450402144772116, "step": 2570}, {"loss": 1.5204, "grad_norm": 0.577880322933197, "learning_rate": 0.0002, "epoch": 3.4584450402144773, "step": 2580}, {"loss": 1.5295, "grad_norm": 0.5532318949699402, "learning_rate": 0.0002, "epoch": 3.4718498659517425, "step": 2590}, {"loss": 1.4933, "grad_norm": 0.5764921307563782, "learning_rate": 0.0002, "epoch": 3.485254691689008, "step": 2600}, {"loss": 1.4355, "grad_norm": 0.6145682334899902, "learning_rate": 0.0002, "epoch": 3.4986595174262733, "step": 2610}, {"loss": 1.4968, "grad_norm": 0.6561126112937927, "learning_rate": 0.0002, "epoch": 3.512064343163539, "step": 2620}, {"loss": 1.5309, "grad_norm": 0.5673288106918335, "learning_rate": 0.0002, "epoch": 3.525469168900804, "step": 2630}, {"loss": 1.5274, "grad_norm": 0.6215338706970215, "learning_rate": 0.0002, "epoch": 3.53887399463807, "step": 2640}, {"loss": 1.5117, "grad_norm": 0.5512040853500366, "learning_rate": 0.0002, "epoch": 3.552278820375335, "step": 2650}, {"loss": 1.5188, "grad_norm": 0.49503496289253235, "learning_rate": 0.0002, "epoch": 3.5656836461126007, "step": 2660}, {"loss": 1.524, "grad_norm": 0.5714912414550781, "learning_rate": 0.0002, "epoch": 3.579088471849866, "step": 2670}, {"loss": 1.4651, "grad_norm": 0.6883154511451721, "learning_rate": 0.0002, "epoch": 3.592493297587131, "step": 2680}, {"loss": 1.5174, "grad_norm": 0.5989556908607483, "learning_rate": 0.0002, "epoch": 3.6058981233243967, "step": 2690}, {"loss": 1.5335, "grad_norm": 0.630268394947052, "learning_rate": 0.0002, "epoch": 3.6193029490616624, "step": 2700}, {"loss": 1.4681, "grad_norm": 0.5819358229637146, "learning_rate": 0.0002, "epoch": 3.6327077747989276, "step": 2710}, {"loss": 1.5676, "grad_norm": 0.6102097034454346, "learning_rate": 0.0002, "epoch": 3.646112600536193, "step": 2720}, {"loss": 1.5566, "grad_norm": 0.6858501434326172, "learning_rate": 0.0002, "epoch": 3.6595174262734584, "step": 2730}, {"loss": 1.5242, "grad_norm": 0.6328608393669128, "learning_rate": 0.0002, "epoch": 3.672922252010724, "step": 2740}, {"loss": 1.5211, "grad_norm": 0.5366981029510498, "learning_rate": 0.0002, "epoch": 3.6863270777479893, "step": 2750}, {"loss": 1.5532, "grad_norm": 0.7048938274383545, "learning_rate": 0.0002, "epoch": 3.6997319034852545, "step": 2760}, {"loss": 1.5001, "grad_norm": 0.5371938347816467, "learning_rate": 0.0002, "epoch": 3.71313672922252, "step": 2770}, {"loss": 1.557, "grad_norm": 0.6142212152481079, "learning_rate": 0.0002, "epoch": 3.726541554959786, "step": 2780}, {"loss": 1.5191, "grad_norm": 0.6164522171020508, "learning_rate": 0.0002, "epoch": 3.739946380697051, "step": 2790}, {"loss": 1.5071, "grad_norm": 0.7511836886405945, "learning_rate": 0.0002, "epoch": 3.753351206434316, "step": 2800}, {"loss": 1.5775, "grad_norm": 0.6194717288017273, "learning_rate": 0.0002, "epoch": 3.766756032171582, "step": 2810}, {"loss": 1.5721, "grad_norm": 0.676721453666687, "learning_rate": 0.0002, "epoch": 3.780160857908847, "step": 2820}, {"loss": 1.502, "grad_norm": 0.5646911263465881, "learning_rate": 0.0002, "epoch": 3.7935656836461127, "step": 2830}, {"loss": 1.4871, "grad_norm": 0.5874826908111572, "learning_rate": 0.0002, "epoch": 3.806970509383378, "step": 2840}, {"loss": 1.5046, "grad_norm": 0.6395232677459717, "learning_rate": 0.0002, "epoch": 3.8203753351206435, "step": 2850}, {"loss": 1.5088, "grad_norm": 0.624563992023468, "learning_rate": 0.0002, "epoch": 3.8337801608579087, "step": 2860}, {"loss": 1.479, "grad_norm": 0.59019935131073, "learning_rate": 0.0002, "epoch": 3.8471849865951744, "step": 2870}, {"loss": 1.4693, "grad_norm": 0.6700479984283447, "learning_rate": 0.0002, "epoch": 3.8605898123324396, "step": 2880}, {"loss": 1.5032, "grad_norm": 0.6131282448768616, "learning_rate": 0.0002, "epoch": 3.8739946380697052, "step": 2890}, {"loss": 1.5446, "grad_norm": 0.6807777881622314, "learning_rate": 0.0002, "epoch": 3.8873994638069704, "step": 2900}, {"loss": 1.5618, "grad_norm": 0.5297217965126038, "learning_rate": 0.0002, "epoch": 3.900804289544236, "step": 2910}, {"loss": 1.5046, "grad_norm": 0.5795540809631348, "learning_rate": 0.0002, "epoch": 3.9142091152815013, "step": 2920}, {"loss": 1.5155, "grad_norm": 0.5549747347831726, "learning_rate": 0.0002, "epoch": 3.927613941018767, "step": 2930}, {"loss": 1.5932, "grad_norm": 0.5895092487335205, "learning_rate": 0.0002, "epoch": 3.941018766756032, "step": 2940}, {"loss": 1.5831, "grad_norm": 0.590002715587616, "learning_rate": 0.0002, "epoch": 3.9544235924932973, "step": 2950}, {"loss": 1.592, "grad_norm": 0.7847695350646973, "learning_rate": 0.0002, "epoch": 3.967828418230563, "step": 2960}, {"loss": 1.4892, "grad_norm": 0.5845848321914673, "learning_rate": 0.0002, "epoch": 3.9812332439678286, "step": 2970}, {"loss": 1.5094, "grad_norm": 0.5861571431159973, "learning_rate": 0.0002, "epoch": 3.994638069705094, "step": 2980}]} +{"epoch": 5.0, "step": 3730, "epoch_duration": 2022.914943933487, "total_accumulated_duration": 10184.498193502426, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}, {"eval_loss": 1.8368606567382812, "eval_runtime": 90.5623, "eval_samples_per_second": 5.687, "eval_steps_per_second": 0.718, "epoch": 3.0, "step": 2238}, {"loss": 1.5704, "grad_norm": 0.40689945220947266, "learning_rate": 0.0002, "epoch": 3.002680965147453, "step": 2240}, {"loss": 1.5961, "grad_norm": 0.4699273705482483, "learning_rate": 0.0002, "epoch": 3.0160857908847185, "step": 2250}, {"loss": 1.5182, "grad_norm": 0.5531830787658691, "learning_rate": 0.0002, "epoch": 3.0294906166219837, "step": 2260}, {"loss": 1.4924, "grad_norm": 0.5441790223121643, "learning_rate": 0.0002, "epoch": 3.0428954423592494, "step": 2270}, {"loss": 1.4953, "grad_norm": 0.6145012974739075, "learning_rate": 0.0002, "epoch": 3.0563002680965146, "step": 2280}, {"loss": 1.4861, "grad_norm": 0.6997102499008179, "learning_rate": 0.0002, "epoch": 3.06970509383378, "step": 2290}, {"loss": 1.5853, "grad_norm": 0.6082330942153931, "learning_rate": 0.0002, "epoch": 3.0831099195710454, "step": 2300}, {"loss": 1.5377, "grad_norm": 0.5294155478477478, "learning_rate": 0.0002, "epoch": 3.096514745308311, "step": 2310}, {"loss": 1.5452, "grad_norm": 0.7200340032577515, "learning_rate": 0.0002, "epoch": 3.1099195710455763, "step": 2320}, {"loss": 1.5296, "grad_norm": 0.721092939376831, "learning_rate": 0.0002, "epoch": 3.123324396782842, "step": 2330}, {"loss": 1.5307, "grad_norm": 0.5344305038452148, "learning_rate": 0.0002, "epoch": 3.136729222520107, "step": 2340}, {"loss": 1.4347, "grad_norm": 0.5533145070075989, "learning_rate": 0.0002, "epoch": 3.1501340482573728, "step": 2350}, {"loss": 1.529, "grad_norm": 0.5976856350898743, "learning_rate": 0.0002, "epoch": 3.163538873994638, "step": 2360}, {"loss": 1.6044, "grad_norm": 0.4974960386753082, "learning_rate": 0.0002, "epoch": 3.1769436997319036, "step": 2370}, {"loss": 1.5554, "grad_norm": 0.6377840042114258, "learning_rate": 0.0002, "epoch": 3.190348525469169, "step": 2380}, {"loss": 1.5322, "grad_norm": 0.5447293519973755, "learning_rate": 0.0002, "epoch": 3.2037533512064345, "step": 2390}, {"loss": 1.5127, "grad_norm": 0.49577030539512634, "learning_rate": 0.0002, "epoch": 3.2171581769436997, "step": 2400}, {"loss": 1.4768, "grad_norm": 0.5588275790214539, "learning_rate": 0.0002, "epoch": 3.2305630026809653, "step": 2410}, {"loss": 1.4755, "grad_norm": 0.6429149508476257, "learning_rate": 0.0002, "epoch": 3.2439678284182305, "step": 2420}, {"loss": 1.5596, "grad_norm": 0.5713154673576355, "learning_rate": 0.0002, "epoch": 3.257372654155496, "step": 2430}, {"loss": 1.4763, "grad_norm": 0.6348955035209656, "learning_rate": 0.0002, "epoch": 3.2707774798927614, "step": 2440}, {"loss": 1.509, "grad_norm": 0.5675528645515442, "learning_rate": 0.0002, "epoch": 3.284182305630027, "step": 2450}, {"loss": 1.5867, "grad_norm": 0.5570188164710999, "learning_rate": 0.0002, "epoch": 3.297587131367292, "step": 2460}, {"loss": 1.554, "grad_norm": 0.6029602289199829, "learning_rate": 0.0002, "epoch": 3.310991957104558, "step": 2470}, {"loss": 1.5094, "grad_norm": 0.523206353187561, "learning_rate": 0.0002, "epoch": 3.324396782841823, "step": 2480}, {"loss": 1.4854, "grad_norm": 0.5912408828735352, "learning_rate": 0.0002, "epoch": 3.3378016085790883, "step": 2490}, {"loss": 1.5097, "grad_norm": 0.5524865984916687, "learning_rate": 0.0002, "epoch": 3.351206434316354, "step": 2500}, {"loss": 1.5064, "grad_norm": 0.60386061668396, "learning_rate": 0.0002, "epoch": 3.3646112600536195, "step": 2510}, {"loss": 1.564, "grad_norm": 0.5838595628738403, "learning_rate": 0.0002, "epoch": 3.3780160857908847, "step": 2520}, {"loss": 1.4615, "grad_norm": 0.5400974154472351, "learning_rate": 0.0002, "epoch": 3.39142091152815, "step": 2530}, {"loss": 1.5349, "grad_norm": 0.6150162220001221, "learning_rate": 0.0002, "epoch": 3.4048257372654156, "step": 2540}, {"loss": 1.5978, "grad_norm": 0.5279412269592285, "learning_rate": 0.0002, "epoch": 3.418230563002681, "step": 2550}, {"loss": 1.5063, "grad_norm": 0.5974063873291016, "learning_rate": 0.0002, "epoch": 3.4316353887399464, "step": 2560}, {"loss": 1.5825, "grad_norm": 0.661573052406311, "learning_rate": 0.0002, "epoch": 3.4450402144772116, "step": 2570}, {"loss": 1.5204, "grad_norm": 0.577880322933197, "learning_rate": 0.0002, "epoch": 3.4584450402144773, "step": 2580}, {"loss": 1.5295, "grad_norm": 0.5532318949699402, "learning_rate": 0.0002, "epoch": 3.4718498659517425, "step": 2590}, {"loss": 1.4933, "grad_norm": 0.5764921307563782, "learning_rate": 0.0002, "epoch": 3.485254691689008, "step": 2600}, {"loss": 1.4355, "grad_norm": 0.6145682334899902, "learning_rate": 0.0002, "epoch": 3.4986595174262733, "step": 2610}, {"loss": 1.4968, "grad_norm": 0.6561126112937927, "learning_rate": 0.0002, "epoch": 3.512064343163539, "step": 2620}, {"loss": 1.5309, "grad_norm": 0.5673288106918335, "learning_rate": 0.0002, "epoch": 3.525469168900804, "step": 2630}, {"loss": 1.5274, "grad_norm": 0.6215338706970215, "learning_rate": 0.0002, "epoch": 3.53887399463807, "step": 2640}, {"loss": 1.5117, "grad_norm": 0.5512040853500366, "learning_rate": 0.0002, "epoch": 3.552278820375335, "step": 2650}, {"loss": 1.5188, "grad_norm": 0.49503496289253235, "learning_rate": 0.0002, "epoch": 3.5656836461126007, "step": 2660}, {"loss": 1.524, "grad_norm": 0.5714912414550781, "learning_rate": 0.0002, "epoch": 3.579088471849866, "step": 2670}, {"loss": 1.4651, "grad_norm": 0.6883154511451721, "learning_rate": 0.0002, "epoch": 3.592493297587131, "step": 2680}, {"loss": 1.5174, "grad_norm": 0.5989556908607483, "learning_rate": 0.0002, "epoch": 3.6058981233243967, "step": 2690}, {"loss": 1.5335, "grad_norm": 0.630268394947052, "learning_rate": 0.0002, "epoch": 3.6193029490616624, "step": 2700}, {"loss": 1.4681, "grad_norm": 0.5819358229637146, "learning_rate": 0.0002, "epoch": 3.6327077747989276, "step": 2710}, {"loss": 1.5676, "grad_norm": 0.6102097034454346, "learning_rate": 0.0002, "epoch": 3.646112600536193, "step": 2720}, {"loss": 1.5566, "grad_norm": 0.6858501434326172, "learning_rate": 0.0002, "epoch": 3.6595174262734584, "step": 2730}, {"loss": 1.5242, "grad_norm": 0.6328608393669128, "learning_rate": 0.0002, "epoch": 3.672922252010724, "step": 2740}, {"loss": 1.5211, "grad_norm": 0.5366981029510498, "learning_rate": 0.0002, "epoch": 3.6863270777479893, "step": 2750}, {"loss": 1.5532, "grad_norm": 0.7048938274383545, "learning_rate": 0.0002, "epoch": 3.6997319034852545, "step": 2760}, {"loss": 1.5001, "grad_norm": 0.5371938347816467, "learning_rate": 0.0002, "epoch": 3.71313672922252, "step": 2770}, {"loss": 1.557, "grad_norm": 0.6142212152481079, "learning_rate": 0.0002, "epoch": 3.726541554959786, "step": 2780}, {"loss": 1.5191, "grad_norm": 0.6164522171020508, "learning_rate": 0.0002, "epoch": 3.739946380697051, "step": 2790}, {"loss": 1.5071, "grad_norm": 0.7511836886405945, "learning_rate": 0.0002, "epoch": 3.753351206434316, "step": 2800}, {"loss": 1.5775, "grad_norm": 0.6194717288017273, "learning_rate": 0.0002, "epoch": 3.766756032171582, "step": 2810}, {"loss": 1.5721, "grad_norm": 0.676721453666687, "learning_rate": 0.0002, "epoch": 3.780160857908847, "step": 2820}, {"loss": 1.502, "grad_norm": 0.5646911263465881, "learning_rate": 0.0002, "epoch": 3.7935656836461127, "step": 2830}, {"loss": 1.4871, "grad_norm": 0.5874826908111572, "learning_rate": 0.0002, "epoch": 3.806970509383378, "step": 2840}, {"loss": 1.5046, "grad_norm": 0.6395232677459717, "learning_rate": 0.0002, "epoch": 3.8203753351206435, "step": 2850}, {"loss": 1.5088, "grad_norm": 0.624563992023468, "learning_rate": 0.0002, "epoch": 3.8337801608579087, "step": 2860}, {"loss": 1.479, "grad_norm": 0.59019935131073, "learning_rate": 0.0002, "epoch": 3.8471849865951744, "step": 2870}, {"loss": 1.4693, "grad_norm": 0.6700479984283447, "learning_rate": 0.0002, "epoch": 3.8605898123324396, "step": 2880}, {"loss": 1.5032, "grad_norm": 0.6131282448768616, "learning_rate": 0.0002, "epoch": 3.8739946380697052, "step": 2890}, {"loss": 1.5446, "grad_norm": 0.6807777881622314, "learning_rate": 0.0002, "epoch": 3.8873994638069704, "step": 2900}, {"loss": 1.5618, "grad_norm": 0.5297217965126038, "learning_rate": 0.0002, "epoch": 3.900804289544236, "step": 2910}, {"loss": 1.5046, "grad_norm": 0.5795540809631348, "learning_rate": 0.0002, "epoch": 3.9142091152815013, "step": 2920}, {"loss": 1.5155, "grad_norm": 0.5549747347831726, "learning_rate": 0.0002, "epoch": 3.927613941018767, "step": 2930}, {"loss": 1.5932, "grad_norm": 0.5895092487335205, "learning_rate": 0.0002, "epoch": 3.941018766756032, "step": 2940}, {"loss": 1.5831, "grad_norm": 0.590002715587616, "learning_rate": 0.0002, "epoch": 3.9544235924932973, "step": 2950}, {"loss": 1.592, "grad_norm": 0.7847695350646973, "learning_rate": 0.0002, "epoch": 3.967828418230563, "step": 2960}, {"loss": 1.4892, "grad_norm": 0.5845848321914673, "learning_rate": 0.0002, "epoch": 3.9812332439678286, "step": 2970}, {"loss": 1.5094, "grad_norm": 0.5861571431159973, "learning_rate": 0.0002, "epoch": 3.994638069705094, "step": 2980}, {"eval_loss": 1.8821998834609985, "eval_runtime": 90.8701, "eval_samples_per_second": 5.667, "eval_steps_per_second": 0.715, "epoch": 4.0, "step": 2984}, {"loss": 1.4156, "grad_norm": 0.6209918260574341, "learning_rate": 0.0002, "epoch": 4.008042895442359, "step": 2990}, {"loss": 1.4244, "grad_norm": 0.607226550579071, "learning_rate": 0.0002, "epoch": 4.021447721179625, "step": 3000}, {"loss": 1.3652, "grad_norm": 0.6677961349487305, "learning_rate": 0.0002, "epoch": 4.03485254691689, "step": 3010}, {"loss": 1.3815, "grad_norm": 0.9053248763084412, "learning_rate": 0.0002, "epoch": 4.048257372654155, "step": 3020}, {"loss": 1.4346, "grad_norm": 0.6815084218978882, "learning_rate": 0.0002, "epoch": 4.061662198391421, "step": 3030}, {"loss": 1.3, "grad_norm": 0.6709407567977905, "learning_rate": 0.0002, "epoch": 4.075067024128686, "step": 3040}, {"loss": 1.3406, "grad_norm": 0.728184163570404, "learning_rate": 0.0002, "epoch": 4.088471849865952, "step": 3050}, {"loss": 1.3404, "grad_norm": 0.817628800868988, "learning_rate": 0.0002, "epoch": 4.101876675603217, "step": 3060}, {"loss": 1.3496, "grad_norm": 0.7384206056594849, "learning_rate": 0.0002, "epoch": 4.115281501340482, "step": 3070}, {"loss": 1.3621, "grad_norm": 0.7380280494689941, "learning_rate": 0.0002, "epoch": 4.128686327077748, "step": 3080}, {"loss": 1.3425, "grad_norm": 0.8197277188301086, "learning_rate": 0.0002, "epoch": 4.142091152815014, "step": 3090}, {"loss": 1.3761, "grad_norm": 0.8971617817878723, "learning_rate": 0.0002, "epoch": 4.1554959785522785, "step": 3100}, {"loss": 1.3564, "grad_norm": 0.7409387826919556, "learning_rate": 0.0002, "epoch": 4.168900804289544, "step": 3110}, {"loss": 1.3675, "grad_norm": 0.6948909163475037, "learning_rate": 0.0002, "epoch": 4.18230563002681, "step": 3120}, {"loss": 1.3397, "grad_norm": 0.7619595527648926, "learning_rate": 0.0002, "epoch": 4.195710455764075, "step": 3130}, {"loss": 1.3864, "grad_norm": 0.7657106518745422, "learning_rate": 0.0002, "epoch": 4.20911528150134, "step": 3140}, {"loss": 1.4017, "grad_norm": 0.6919401288032532, "learning_rate": 0.0002, "epoch": 4.222520107238606, "step": 3150}, {"loss": 1.3692, "grad_norm": 0.6991415023803711, "learning_rate": 0.0002, "epoch": 4.2359249329758715, "step": 3160}, {"loss": 1.3651, "grad_norm": 0.7349252700805664, "learning_rate": 0.0002, "epoch": 4.249329758713137, "step": 3170}, {"loss": 1.367, "grad_norm": 0.8838240504264832, "learning_rate": 0.0002, "epoch": 4.262734584450402, "step": 3180}, {"loss": 1.4254, "grad_norm": 0.7240107655525208, "learning_rate": 0.0002, "epoch": 4.2761394101876675, "step": 3190}, {"loss": 1.3671, "grad_norm": 0.7338636517524719, "learning_rate": 0.0002, "epoch": 4.289544235924933, "step": 3200}, {"loss": 1.448, "grad_norm": 0.7891436815261841, "learning_rate": 0.0002, "epoch": 4.302949061662199, "step": 3210}, {"loss": 1.3291, "grad_norm": 0.7407845854759216, "learning_rate": 0.0002, "epoch": 4.316353887399464, "step": 3220}, {"loss": 1.3899, "grad_norm": 0.7635948061943054, "learning_rate": 0.0002, "epoch": 4.329758713136729, "step": 3230}, {"loss": 1.3384, "grad_norm": 0.7478461861610413, "learning_rate": 0.0002, "epoch": 4.343163538873995, "step": 3240}, {"loss": 1.388, "grad_norm": 0.7684298157691956, "learning_rate": 0.0002, "epoch": 4.35656836461126, "step": 3250}, {"loss": 1.4233, "grad_norm": 1.0287525653839111, "learning_rate": 0.0002, "epoch": 4.369973190348525, "step": 3260}, {"loss": 1.3542, "grad_norm": 0.750616192817688, "learning_rate": 0.0002, "epoch": 4.383378016085791, "step": 3270}, {"loss": 1.3158, "grad_norm": 0.7911648750305176, "learning_rate": 0.0002, "epoch": 4.396782841823057, "step": 3280}, {"loss": 1.3896, "grad_norm": 0.9156750440597534, "learning_rate": 0.0002, "epoch": 4.410187667560321, "step": 3290}, {"loss": 1.3887, "grad_norm": 1.0180249214172363, "learning_rate": 0.0002, "epoch": 4.423592493297587, "step": 3300}, {"loss": 1.4143, "grad_norm": 1.0792218446731567, "learning_rate": 0.0002, "epoch": 4.436997319034853, "step": 3310}, {"loss": 1.3314, "grad_norm": 0.8027488589286804, "learning_rate": 0.0002, "epoch": 4.450402144772118, "step": 3320}, {"loss": 1.4144, "grad_norm": 0.8037815093994141, "learning_rate": 0.0002, "epoch": 4.463806970509383, "step": 3330}, {"loss": 1.4124, "grad_norm": 0.7907946705818176, "learning_rate": 0.0002, "epoch": 4.477211796246649, "step": 3340}, {"loss": 1.443, "grad_norm": 0.7206302881240845, "learning_rate": 0.0002, "epoch": 4.490616621983914, "step": 3350}, {"loss": 1.3822, "grad_norm": 0.7697674632072449, "learning_rate": 0.0002, "epoch": 4.50402144772118, "step": 3360}, {"loss": 1.3923, "grad_norm": 0.7315130829811096, "learning_rate": 0.0002, "epoch": 4.517426273458445, "step": 3370}, {"loss": 1.3598, "grad_norm": 0.7896273136138916, "learning_rate": 0.0002, "epoch": 4.53083109919571, "step": 3380}, {"loss": 1.3947, "grad_norm": 0.7720345258712769, "learning_rate": 0.0002, "epoch": 4.544235924932976, "step": 3390}, {"loss": 1.404, "grad_norm": 0.8304631114006042, "learning_rate": 0.0002, "epoch": 4.557640750670242, "step": 3400}, {"loss": 1.3712, "grad_norm": 0.7408214211463928, "learning_rate": 0.0002, "epoch": 4.571045576407506, "step": 3410}, {"loss": 1.3957, "grad_norm": 0.8100157976150513, "learning_rate": 0.0002, "epoch": 4.584450402144772, "step": 3420}, {"loss": 1.47, "grad_norm": 0.7829574942588806, "learning_rate": 0.0002, "epoch": 4.597855227882038, "step": 3430}, {"loss": 1.3684, "grad_norm": 0.9529728889465332, "learning_rate": 0.0002, "epoch": 4.6112600536193025, "step": 3440}, {"loss": 1.3984, "grad_norm": 1.0769460201263428, "learning_rate": 0.0002, "epoch": 4.624664879356568, "step": 3450}, {"loss": 1.4063, "grad_norm": 0.8941947817802429, "learning_rate": 0.0002, "epoch": 4.638069705093834, "step": 3460}, {"loss": 1.4421, "grad_norm": 0.7860096096992493, "learning_rate": 0.0002, "epoch": 4.651474530831099, "step": 3470}, {"loss": 1.3782, "grad_norm": 0.8184044361114502, "learning_rate": 0.0002, "epoch": 4.664879356568365, "step": 3480}, {"loss": 1.3885, "grad_norm": 0.7852717638015747, "learning_rate": 0.0002, "epoch": 4.67828418230563, "step": 3490}, {"loss": 1.4139, "grad_norm": 0.750586986541748, "learning_rate": 0.0002, "epoch": 4.6916890080428955, "step": 3500}, {"loss": 1.3224, "grad_norm": 0.7966068983078003, "learning_rate": 0.0002, "epoch": 4.705093833780161, "step": 3510}, {"loss": 1.4052, "grad_norm": 0.8387030959129333, "learning_rate": 0.0002, "epoch": 4.718498659517426, "step": 3520}, {"loss": 1.4541, "grad_norm": 0.7373180389404297, "learning_rate": 0.0002, "epoch": 4.7319034852546915, "step": 3530}, {"loss": 1.4148, "grad_norm": 0.8415353894233704, "learning_rate": 0.0002, "epoch": 4.745308310991957, "step": 3540}, {"loss": 1.4236, "grad_norm": 0.7155488133430481, "learning_rate": 0.0002, "epoch": 4.758713136729223, "step": 3550}, {"loss": 1.3454, "grad_norm": 0.697658896446228, "learning_rate": 0.0002, "epoch": 4.772117962466488, "step": 3560}, {"loss": 1.4002, "grad_norm": 0.8722999095916748, "learning_rate": 0.0002, "epoch": 4.785522788203753, "step": 3570}, {"loss": 1.4224, "grad_norm": 0.8106381297111511, "learning_rate": 0.0002, "epoch": 4.798927613941019, "step": 3580}, {"loss": 1.3525, "grad_norm": 0.9320500493049622, "learning_rate": 0.0002, "epoch": 4.8123324396782845, "step": 3590}, {"loss": 1.3675, "grad_norm": 0.7583016157150269, "learning_rate": 0.0002, "epoch": 4.825737265415549, "step": 3600}, {"loss": 1.3761, "grad_norm": 0.790050208568573, "learning_rate": 0.0002, "epoch": 4.839142091152815, "step": 3610}, {"loss": 1.4144, "grad_norm": 0.7481580972671509, "learning_rate": 0.0002, "epoch": 4.8525469168900806, "step": 3620}, {"loss": 1.4424, "grad_norm": 0.8709374666213989, "learning_rate": 0.0002, "epoch": 4.865951742627346, "step": 3630}, {"loss": 1.3758, "grad_norm": 0.7266733050346375, "learning_rate": 0.0002, "epoch": 4.879356568364611, "step": 3640}, {"loss": 1.4254, "grad_norm": 0.7669504880905151, "learning_rate": 0.0002, "epoch": 4.892761394101877, "step": 3650}, {"loss": 1.3956, "grad_norm": 0.7855764627456665, "learning_rate": 0.0002, "epoch": 4.906166219839142, "step": 3660}, {"loss": 1.4609, "grad_norm": 0.8145440816879272, "learning_rate": 0.0002, "epoch": 4.919571045576408, "step": 3670}, {"loss": 1.4152, "grad_norm": 0.7487278580665588, "learning_rate": 0.0002, "epoch": 4.932975871313673, "step": 3680}, {"loss": 1.4386, "grad_norm": 0.8390981554985046, "learning_rate": 0.0002, "epoch": 4.946380697050938, "step": 3690}, {"loss": 1.3504, "grad_norm": 0.663752555847168, "learning_rate": 0.0002, "epoch": 4.959785522788204, "step": 3700}, {"loss": 1.3453, "grad_norm": 0.7821969985961914, "learning_rate": 0.0002, "epoch": 4.973190348525469, "step": 3710}, {"loss": 1.3936, "grad_norm": 0.9157266020774841, "learning_rate": 0.0002, "epoch": 4.986595174262734, "step": 3720}, {"loss": 1.3925, "grad_norm": 0.7683535814285278, "learning_rate": 0.0002, "epoch": 5.0, "step": 3730}]} +{"epoch": 6.0, "step": 4476, "epoch_duration": 2114.996573448181, "total_accumulated_duration": 12299.494766950607, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}, {"eval_loss": 1.8368606567382812, "eval_runtime": 90.5623, "eval_samples_per_second": 5.687, "eval_steps_per_second": 0.718, "epoch": 3.0, "step": 2238}, {"loss": 1.5704, "grad_norm": 0.40689945220947266, "learning_rate": 0.0002, "epoch": 3.002680965147453, "step": 2240}, {"loss": 1.5961, "grad_norm": 0.4699273705482483, "learning_rate": 0.0002, "epoch": 3.0160857908847185, "step": 2250}, {"loss": 1.5182, "grad_norm": 0.5531830787658691, "learning_rate": 0.0002, "epoch": 3.0294906166219837, "step": 2260}, {"loss": 1.4924, "grad_norm": 0.5441790223121643, "learning_rate": 0.0002, "epoch": 3.0428954423592494, "step": 2270}, {"loss": 1.4953, "grad_norm": 0.6145012974739075, "learning_rate": 0.0002, "epoch": 3.0563002680965146, "step": 2280}, {"loss": 1.4861, "grad_norm": 0.6997102499008179, "learning_rate": 0.0002, "epoch": 3.06970509383378, "step": 2290}, {"loss": 1.5853, "grad_norm": 0.6082330942153931, "learning_rate": 0.0002, "epoch": 3.0831099195710454, "step": 2300}, {"loss": 1.5377, "grad_norm": 0.5294155478477478, "learning_rate": 0.0002, "epoch": 3.096514745308311, "step": 2310}, {"loss": 1.5452, "grad_norm": 0.7200340032577515, "learning_rate": 0.0002, "epoch": 3.1099195710455763, "step": 2320}, {"loss": 1.5296, "grad_norm": 0.721092939376831, "learning_rate": 0.0002, "epoch": 3.123324396782842, "step": 2330}, {"loss": 1.5307, "grad_norm": 0.5344305038452148, "learning_rate": 0.0002, "epoch": 3.136729222520107, "step": 2340}, {"loss": 1.4347, "grad_norm": 0.5533145070075989, "learning_rate": 0.0002, "epoch": 3.1501340482573728, "step": 2350}, {"loss": 1.529, "grad_norm": 0.5976856350898743, "learning_rate": 0.0002, "epoch": 3.163538873994638, "step": 2360}, {"loss": 1.6044, "grad_norm": 0.4974960386753082, "learning_rate": 0.0002, "epoch": 3.1769436997319036, "step": 2370}, {"loss": 1.5554, "grad_norm": 0.6377840042114258, "learning_rate": 0.0002, "epoch": 3.190348525469169, "step": 2380}, {"loss": 1.5322, "grad_norm": 0.5447293519973755, "learning_rate": 0.0002, "epoch": 3.2037533512064345, "step": 2390}, {"loss": 1.5127, "grad_norm": 0.49577030539512634, "learning_rate": 0.0002, "epoch": 3.2171581769436997, "step": 2400}, {"loss": 1.4768, "grad_norm": 0.5588275790214539, "learning_rate": 0.0002, "epoch": 3.2305630026809653, "step": 2410}, {"loss": 1.4755, "grad_norm": 0.6429149508476257, "learning_rate": 0.0002, "epoch": 3.2439678284182305, "step": 2420}, {"loss": 1.5596, "grad_norm": 0.5713154673576355, "learning_rate": 0.0002, "epoch": 3.257372654155496, "step": 2430}, {"loss": 1.4763, "grad_norm": 0.6348955035209656, "learning_rate": 0.0002, "epoch": 3.2707774798927614, "step": 2440}, {"loss": 1.509, "grad_norm": 0.5675528645515442, "learning_rate": 0.0002, "epoch": 3.284182305630027, "step": 2450}, {"loss": 1.5867, "grad_norm": 0.5570188164710999, "learning_rate": 0.0002, "epoch": 3.297587131367292, "step": 2460}, {"loss": 1.554, "grad_norm": 0.6029602289199829, "learning_rate": 0.0002, "epoch": 3.310991957104558, "step": 2470}, {"loss": 1.5094, "grad_norm": 0.523206353187561, "learning_rate": 0.0002, "epoch": 3.324396782841823, "step": 2480}, {"loss": 1.4854, "grad_norm": 0.5912408828735352, "learning_rate": 0.0002, "epoch": 3.3378016085790883, "step": 2490}, {"loss": 1.5097, "grad_norm": 0.5524865984916687, "learning_rate": 0.0002, "epoch": 3.351206434316354, "step": 2500}, {"loss": 1.5064, "grad_norm": 0.60386061668396, "learning_rate": 0.0002, "epoch": 3.3646112600536195, "step": 2510}, {"loss": 1.564, "grad_norm": 0.5838595628738403, "learning_rate": 0.0002, "epoch": 3.3780160857908847, "step": 2520}, {"loss": 1.4615, "grad_norm": 0.5400974154472351, "learning_rate": 0.0002, "epoch": 3.39142091152815, "step": 2530}, {"loss": 1.5349, "grad_norm": 0.6150162220001221, "learning_rate": 0.0002, "epoch": 3.4048257372654156, "step": 2540}, {"loss": 1.5978, "grad_norm": 0.5279412269592285, "learning_rate": 0.0002, "epoch": 3.418230563002681, "step": 2550}, {"loss": 1.5063, "grad_norm": 0.5974063873291016, "learning_rate": 0.0002, "epoch": 3.4316353887399464, "step": 2560}, {"loss": 1.5825, "grad_norm": 0.661573052406311, "learning_rate": 0.0002, "epoch": 3.4450402144772116, "step": 2570}, {"loss": 1.5204, "grad_norm": 0.577880322933197, "learning_rate": 0.0002, "epoch": 3.4584450402144773, "step": 2580}, {"loss": 1.5295, "grad_norm": 0.5532318949699402, "learning_rate": 0.0002, "epoch": 3.4718498659517425, "step": 2590}, {"loss": 1.4933, "grad_norm": 0.5764921307563782, "learning_rate": 0.0002, "epoch": 3.485254691689008, "step": 2600}, {"loss": 1.4355, "grad_norm": 0.6145682334899902, "learning_rate": 0.0002, "epoch": 3.4986595174262733, "step": 2610}, {"loss": 1.4968, "grad_norm": 0.6561126112937927, "learning_rate": 0.0002, "epoch": 3.512064343163539, "step": 2620}, {"loss": 1.5309, "grad_norm": 0.5673288106918335, "learning_rate": 0.0002, "epoch": 3.525469168900804, "step": 2630}, {"loss": 1.5274, "grad_norm": 0.6215338706970215, "learning_rate": 0.0002, "epoch": 3.53887399463807, "step": 2640}, {"loss": 1.5117, "grad_norm": 0.5512040853500366, "learning_rate": 0.0002, "epoch": 3.552278820375335, "step": 2650}, {"loss": 1.5188, "grad_norm": 0.49503496289253235, "learning_rate": 0.0002, "epoch": 3.5656836461126007, "step": 2660}, {"loss": 1.524, "grad_norm": 0.5714912414550781, "learning_rate": 0.0002, "epoch": 3.579088471849866, "step": 2670}, {"loss": 1.4651, "grad_norm": 0.6883154511451721, "learning_rate": 0.0002, "epoch": 3.592493297587131, "step": 2680}, {"loss": 1.5174, "grad_norm": 0.5989556908607483, "learning_rate": 0.0002, "epoch": 3.6058981233243967, "step": 2690}, {"loss": 1.5335, "grad_norm": 0.630268394947052, "learning_rate": 0.0002, "epoch": 3.6193029490616624, "step": 2700}, {"loss": 1.4681, "grad_norm": 0.5819358229637146, "learning_rate": 0.0002, "epoch": 3.6327077747989276, "step": 2710}, {"loss": 1.5676, "grad_norm": 0.6102097034454346, "learning_rate": 0.0002, "epoch": 3.646112600536193, "step": 2720}, {"loss": 1.5566, "grad_norm": 0.6858501434326172, "learning_rate": 0.0002, "epoch": 3.6595174262734584, "step": 2730}, {"loss": 1.5242, "grad_norm": 0.6328608393669128, "learning_rate": 0.0002, "epoch": 3.672922252010724, "step": 2740}, {"loss": 1.5211, "grad_norm": 0.5366981029510498, "learning_rate": 0.0002, "epoch": 3.6863270777479893, "step": 2750}, {"loss": 1.5532, "grad_norm": 0.7048938274383545, "learning_rate": 0.0002, "epoch": 3.6997319034852545, "step": 2760}, {"loss": 1.5001, "grad_norm": 0.5371938347816467, "learning_rate": 0.0002, "epoch": 3.71313672922252, "step": 2770}, {"loss": 1.557, "grad_norm": 0.6142212152481079, "learning_rate": 0.0002, "epoch": 3.726541554959786, "step": 2780}, {"loss": 1.5191, "grad_norm": 0.6164522171020508, "learning_rate": 0.0002, "epoch": 3.739946380697051, "step": 2790}, {"loss": 1.5071, "grad_norm": 0.7511836886405945, "learning_rate": 0.0002, "epoch": 3.753351206434316, "step": 2800}, {"loss": 1.5775, "grad_norm": 0.6194717288017273, "learning_rate": 0.0002, "epoch": 3.766756032171582, "step": 2810}, {"loss": 1.5721, "grad_norm": 0.676721453666687, "learning_rate": 0.0002, "epoch": 3.780160857908847, "step": 2820}, {"loss": 1.502, "grad_norm": 0.5646911263465881, "learning_rate": 0.0002, "epoch": 3.7935656836461127, "step": 2830}, {"loss": 1.4871, "grad_norm": 0.5874826908111572, "learning_rate": 0.0002, "epoch": 3.806970509383378, "step": 2840}, {"loss": 1.5046, "grad_norm": 0.6395232677459717, "learning_rate": 0.0002, "epoch": 3.8203753351206435, "step": 2850}, {"loss": 1.5088, "grad_norm": 0.624563992023468, "learning_rate": 0.0002, "epoch": 3.8337801608579087, "step": 2860}, {"loss": 1.479, "grad_norm": 0.59019935131073, "learning_rate": 0.0002, "epoch": 3.8471849865951744, "step": 2870}, {"loss": 1.4693, "grad_norm": 0.6700479984283447, "learning_rate": 0.0002, "epoch": 3.8605898123324396, "step": 2880}, {"loss": 1.5032, "grad_norm": 0.6131282448768616, "learning_rate": 0.0002, "epoch": 3.8739946380697052, "step": 2890}, {"loss": 1.5446, "grad_norm": 0.6807777881622314, "learning_rate": 0.0002, "epoch": 3.8873994638069704, "step": 2900}, {"loss": 1.5618, "grad_norm": 0.5297217965126038, "learning_rate": 0.0002, "epoch": 3.900804289544236, "step": 2910}, {"loss": 1.5046, "grad_norm": 0.5795540809631348, "learning_rate": 0.0002, "epoch": 3.9142091152815013, "step": 2920}, {"loss": 1.5155, "grad_norm": 0.5549747347831726, "learning_rate": 0.0002, "epoch": 3.927613941018767, "step": 2930}, {"loss": 1.5932, "grad_norm": 0.5895092487335205, "learning_rate": 0.0002, "epoch": 3.941018766756032, "step": 2940}, {"loss": 1.5831, "grad_norm": 0.590002715587616, "learning_rate": 0.0002, "epoch": 3.9544235924932973, "step": 2950}, {"loss": 1.592, "grad_norm": 0.7847695350646973, "learning_rate": 0.0002, "epoch": 3.967828418230563, "step": 2960}, {"loss": 1.4892, "grad_norm": 0.5845848321914673, "learning_rate": 0.0002, "epoch": 3.9812332439678286, "step": 2970}, {"loss": 1.5094, "grad_norm": 0.5861571431159973, "learning_rate": 0.0002, "epoch": 3.994638069705094, "step": 2980}, {"eval_loss": 1.8821998834609985, "eval_runtime": 90.8701, "eval_samples_per_second": 5.667, "eval_steps_per_second": 0.715, "epoch": 4.0, "step": 2984}, {"loss": 1.4156, "grad_norm": 0.6209918260574341, "learning_rate": 0.0002, "epoch": 4.008042895442359, "step": 2990}, {"loss": 1.4244, "grad_norm": 0.607226550579071, "learning_rate": 0.0002, "epoch": 4.021447721179625, "step": 3000}, {"loss": 1.3652, "grad_norm": 0.6677961349487305, "learning_rate": 0.0002, "epoch": 4.03485254691689, "step": 3010}, {"loss": 1.3815, "grad_norm": 0.9053248763084412, "learning_rate": 0.0002, "epoch": 4.048257372654155, "step": 3020}, {"loss": 1.4346, "grad_norm": 0.6815084218978882, "learning_rate": 0.0002, "epoch": 4.061662198391421, "step": 3030}, {"loss": 1.3, "grad_norm": 0.6709407567977905, "learning_rate": 0.0002, "epoch": 4.075067024128686, "step": 3040}, {"loss": 1.3406, "grad_norm": 0.728184163570404, "learning_rate": 0.0002, "epoch": 4.088471849865952, "step": 3050}, {"loss": 1.3404, "grad_norm": 0.817628800868988, "learning_rate": 0.0002, "epoch": 4.101876675603217, "step": 3060}, {"loss": 1.3496, "grad_norm": 0.7384206056594849, "learning_rate": 0.0002, "epoch": 4.115281501340482, "step": 3070}, {"loss": 1.3621, "grad_norm": 0.7380280494689941, "learning_rate": 0.0002, "epoch": 4.128686327077748, "step": 3080}, {"loss": 1.3425, "grad_norm": 0.8197277188301086, "learning_rate": 0.0002, "epoch": 4.142091152815014, "step": 3090}, {"loss": 1.3761, "grad_norm": 0.8971617817878723, "learning_rate": 0.0002, "epoch": 4.1554959785522785, "step": 3100}, {"loss": 1.3564, "grad_norm": 0.7409387826919556, "learning_rate": 0.0002, "epoch": 4.168900804289544, "step": 3110}, {"loss": 1.3675, "grad_norm": 0.6948909163475037, "learning_rate": 0.0002, "epoch": 4.18230563002681, "step": 3120}, {"loss": 1.3397, "grad_norm": 0.7619595527648926, "learning_rate": 0.0002, "epoch": 4.195710455764075, "step": 3130}, {"loss": 1.3864, "grad_norm": 0.7657106518745422, "learning_rate": 0.0002, "epoch": 4.20911528150134, "step": 3140}, {"loss": 1.4017, "grad_norm": 0.6919401288032532, "learning_rate": 0.0002, "epoch": 4.222520107238606, "step": 3150}, {"loss": 1.3692, "grad_norm": 0.6991415023803711, "learning_rate": 0.0002, "epoch": 4.2359249329758715, "step": 3160}, {"loss": 1.3651, "grad_norm": 0.7349252700805664, "learning_rate": 0.0002, "epoch": 4.249329758713137, "step": 3170}, {"loss": 1.367, "grad_norm": 0.8838240504264832, "learning_rate": 0.0002, "epoch": 4.262734584450402, "step": 3180}, {"loss": 1.4254, "grad_norm": 0.7240107655525208, "learning_rate": 0.0002, "epoch": 4.2761394101876675, "step": 3190}, {"loss": 1.3671, "grad_norm": 0.7338636517524719, "learning_rate": 0.0002, "epoch": 4.289544235924933, "step": 3200}, {"loss": 1.448, "grad_norm": 0.7891436815261841, "learning_rate": 0.0002, "epoch": 4.302949061662199, "step": 3210}, {"loss": 1.3291, "grad_norm": 0.7407845854759216, "learning_rate": 0.0002, "epoch": 4.316353887399464, "step": 3220}, {"loss": 1.3899, "grad_norm": 0.7635948061943054, "learning_rate": 0.0002, "epoch": 4.329758713136729, "step": 3230}, {"loss": 1.3384, "grad_norm": 0.7478461861610413, "learning_rate": 0.0002, "epoch": 4.343163538873995, "step": 3240}, {"loss": 1.388, "grad_norm": 0.7684298157691956, "learning_rate": 0.0002, "epoch": 4.35656836461126, "step": 3250}, {"loss": 1.4233, "grad_norm": 1.0287525653839111, "learning_rate": 0.0002, "epoch": 4.369973190348525, "step": 3260}, {"loss": 1.3542, "grad_norm": 0.750616192817688, "learning_rate": 0.0002, "epoch": 4.383378016085791, "step": 3270}, {"loss": 1.3158, "grad_norm": 0.7911648750305176, "learning_rate": 0.0002, "epoch": 4.396782841823057, "step": 3280}, {"loss": 1.3896, "grad_norm": 0.9156750440597534, "learning_rate": 0.0002, "epoch": 4.410187667560321, "step": 3290}, {"loss": 1.3887, "grad_norm": 1.0180249214172363, "learning_rate": 0.0002, "epoch": 4.423592493297587, "step": 3300}, {"loss": 1.4143, "grad_norm": 1.0792218446731567, "learning_rate": 0.0002, "epoch": 4.436997319034853, "step": 3310}, {"loss": 1.3314, "grad_norm": 0.8027488589286804, "learning_rate": 0.0002, "epoch": 4.450402144772118, "step": 3320}, {"loss": 1.4144, "grad_norm": 0.8037815093994141, "learning_rate": 0.0002, "epoch": 4.463806970509383, "step": 3330}, {"loss": 1.4124, "grad_norm": 0.7907946705818176, "learning_rate": 0.0002, "epoch": 4.477211796246649, "step": 3340}, {"loss": 1.443, "grad_norm": 0.7206302881240845, "learning_rate": 0.0002, "epoch": 4.490616621983914, "step": 3350}, {"loss": 1.3822, "grad_norm": 0.7697674632072449, "learning_rate": 0.0002, "epoch": 4.50402144772118, "step": 3360}, {"loss": 1.3923, "grad_norm": 0.7315130829811096, "learning_rate": 0.0002, "epoch": 4.517426273458445, "step": 3370}, {"loss": 1.3598, "grad_norm": 0.7896273136138916, "learning_rate": 0.0002, "epoch": 4.53083109919571, "step": 3380}, {"loss": 1.3947, "grad_norm": 0.7720345258712769, "learning_rate": 0.0002, "epoch": 4.544235924932976, "step": 3390}, {"loss": 1.404, "grad_norm": 0.8304631114006042, "learning_rate": 0.0002, "epoch": 4.557640750670242, "step": 3400}, {"loss": 1.3712, "grad_norm": 0.7408214211463928, "learning_rate": 0.0002, "epoch": 4.571045576407506, "step": 3410}, {"loss": 1.3957, "grad_norm": 0.8100157976150513, "learning_rate": 0.0002, "epoch": 4.584450402144772, "step": 3420}, {"loss": 1.47, "grad_norm": 0.7829574942588806, "learning_rate": 0.0002, "epoch": 4.597855227882038, "step": 3430}, {"loss": 1.3684, "grad_norm": 0.9529728889465332, "learning_rate": 0.0002, "epoch": 4.6112600536193025, "step": 3440}, {"loss": 1.3984, "grad_norm": 1.0769460201263428, "learning_rate": 0.0002, "epoch": 4.624664879356568, "step": 3450}, {"loss": 1.4063, "grad_norm": 0.8941947817802429, "learning_rate": 0.0002, "epoch": 4.638069705093834, "step": 3460}, {"loss": 1.4421, "grad_norm": 0.7860096096992493, "learning_rate": 0.0002, "epoch": 4.651474530831099, "step": 3470}, {"loss": 1.3782, "grad_norm": 0.8184044361114502, "learning_rate": 0.0002, "epoch": 4.664879356568365, "step": 3480}, {"loss": 1.3885, "grad_norm": 0.7852717638015747, "learning_rate": 0.0002, "epoch": 4.67828418230563, "step": 3490}, {"loss": 1.4139, "grad_norm": 0.750586986541748, "learning_rate": 0.0002, "epoch": 4.6916890080428955, "step": 3500}, {"loss": 1.3224, "grad_norm": 0.7966068983078003, "learning_rate": 0.0002, "epoch": 4.705093833780161, "step": 3510}, {"loss": 1.4052, "grad_norm": 0.8387030959129333, "learning_rate": 0.0002, "epoch": 4.718498659517426, "step": 3520}, {"loss": 1.4541, "grad_norm": 0.7373180389404297, "learning_rate": 0.0002, "epoch": 4.7319034852546915, "step": 3530}, {"loss": 1.4148, "grad_norm": 0.8415353894233704, "learning_rate": 0.0002, "epoch": 4.745308310991957, "step": 3540}, {"loss": 1.4236, "grad_norm": 0.7155488133430481, "learning_rate": 0.0002, "epoch": 4.758713136729223, "step": 3550}, {"loss": 1.3454, "grad_norm": 0.697658896446228, "learning_rate": 0.0002, "epoch": 4.772117962466488, "step": 3560}, {"loss": 1.4002, "grad_norm": 0.8722999095916748, "learning_rate": 0.0002, "epoch": 4.785522788203753, "step": 3570}, {"loss": 1.4224, "grad_norm": 0.8106381297111511, "learning_rate": 0.0002, "epoch": 4.798927613941019, "step": 3580}, {"loss": 1.3525, "grad_norm": 0.9320500493049622, "learning_rate": 0.0002, "epoch": 4.8123324396782845, "step": 3590}, {"loss": 1.3675, "grad_norm": 0.7583016157150269, "learning_rate": 0.0002, "epoch": 4.825737265415549, "step": 3600}, {"loss": 1.3761, "grad_norm": 0.790050208568573, "learning_rate": 0.0002, "epoch": 4.839142091152815, "step": 3610}, {"loss": 1.4144, "grad_norm": 0.7481580972671509, "learning_rate": 0.0002, "epoch": 4.8525469168900806, "step": 3620}, {"loss": 1.4424, "grad_norm": 0.8709374666213989, "learning_rate": 0.0002, "epoch": 4.865951742627346, "step": 3630}, {"loss": 1.3758, "grad_norm": 0.7266733050346375, "learning_rate": 0.0002, "epoch": 4.879356568364611, "step": 3640}, {"loss": 1.4254, "grad_norm": 0.7669504880905151, "learning_rate": 0.0002, "epoch": 4.892761394101877, "step": 3650}, {"loss": 1.3956, "grad_norm": 0.7855764627456665, "learning_rate": 0.0002, "epoch": 4.906166219839142, "step": 3660}, {"loss": 1.4609, "grad_norm": 0.8145440816879272, "learning_rate": 0.0002, "epoch": 4.919571045576408, "step": 3670}, {"loss": 1.4152, "grad_norm": 0.7487278580665588, "learning_rate": 0.0002, "epoch": 4.932975871313673, "step": 3680}, {"loss": 1.4386, "grad_norm": 0.8390981554985046, "learning_rate": 0.0002, "epoch": 4.946380697050938, "step": 3690}, {"loss": 1.3504, "grad_norm": 0.663752555847168, "learning_rate": 0.0002, "epoch": 4.959785522788204, "step": 3700}, {"loss": 1.3453, "grad_norm": 0.7821969985961914, "learning_rate": 0.0002, "epoch": 4.973190348525469, "step": 3710}, {"loss": 1.3936, "grad_norm": 0.9157266020774841, "learning_rate": 0.0002, "epoch": 4.986595174262734, "step": 3720}, {"loss": 1.3925, "grad_norm": 0.7683535814285278, "learning_rate": 0.0002, "epoch": 5.0, "step": 3730}, {"eval_loss": 1.9639414548873901, "eval_runtime": 92.0173, "eval_samples_per_second": 5.597, "eval_steps_per_second": 0.706, "epoch": 5.0, "step": 3730}, {"loss": 1.1852, "grad_norm": 1.3000373840332031, "learning_rate": 0.0002, "epoch": 5.013404825737266, "step": 3740}, {"loss": 1.1922, "grad_norm": 0.8916982412338257, "learning_rate": 0.0002, "epoch": 5.02680965147453, "step": 3750}, {"loss": 1.2113, "grad_norm": 1.0365116596221924, "learning_rate": 0.0002, "epoch": 5.040214477211796, "step": 3760}, {"loss": 1.2941, "grad_norm": 0.999420166015625, "learning_rate": 0.0002, "epoch": 5.053619302949062, "step": 3770}, {"loss": 1.24, "grad_norm": 1.093572974205017, "learning_rate": 0.0002, "epoch": 5.067024128686327, "step": 3780}, {"loss": 1.2345, "grad_norm": 1.1137515306472778, "learning_rate": 0.0002, "epoch": 5.080428954423592, "step": 3790}, {"loss": 1.1646, "grad_norm": 1.0328283309936523, "learning_rate": 0.0002, "epoch": 5.093833780160858, "step": 3800}, {"loss": 1.1716, "grad_norm": 1.0444108247756958, "learning_rate": 0.0002, "epoch": 5.107238605898123, "step": 3810}, {"loss": 1.2226, "grad_norm": 0.858148992061615, "learning_rate": 0.0002, "epoch": 5.120643431635389, "step": 3820}, {"loss": 1.1691, "grad_norm": 0.94026780128479, "learning_rate": 0.0002, "epoch": 5.134048257372654, "step": 3830}, {"loss": 1.1902, "grad_norm": 0.8987152576446533, "learning_rate": 0.0002, "epoch": 5.1474530831099194, "step": 3840}, {"loss": 1.1562, "grad_norm": 0.922997236251831, "learning_rate": 0.0002, "epoch": 5.160857908847185, "step": 3850}, {"loss": 1.2072, "grad_norm": 0.9172422289848328, "learning_rate": 0.0002, "epoch": 5.174262734584451, "step": 3860}, {"loss": 1.1802, "grad_norm": 1.02277672290802, "learning_rate": 0.0002, "epoch": 5.1876675603217155, "step": 3870}, {"loss": 1.2206, "grad_norm": 1.093826413154602, "learning_rate": 0.0002, "epoch": 5.201072386058981, "step": 3880}, {"loss": 1.2578, "grad_norm": 0.9362447261810303, "learning_rate": 0.0002, "epoch": 5.214477211796247, "step": 3890}, {"loss": 1.2335, "grad_norm": 1.0564044713974, "learning_rate": 0.0002, "epoch": 5.227882037533512, "step": 3900}, {"loss": 1.1936, "grad_norm": 0.869575023651123, "learning_rate": 0.0002, "epoch": 5.241286863270777, "step": 3910}, {"loss": 1.2301, "grad_norm": 1.0383203029632568, "learning_rate": 0.0002, "epoch": 5.254691689008043, "step": 3920}, {"loss": 1.2076, "grad_norm": 0.9146919846534729, "learning_rate": 0.0002, "epoch": 5.2680965147453085, "step": 3930}, {"loss": 1.2804, "grad_norm": 0.9226430654525757, "learning_rate": 0.0002, "epoch": 5.281501340482574, "step": 3940}, {"loss": 1.2506, "grad_norm": 0.8703194260597229, "learning_rate": 0.0002, "epoch": 5.294906166219839, "step": 3950}, {"loss": 1.2533, "grad_norm": 1.0588284730911255, "learning_rate": 0.0002, "epoch": 5.3083109919571045, "step": 3960}, {"loss": 1.2405, "grad_norm": 1.1131688356399536, "learning_rate": 0.0002, "epoch": 5.32171581769437, "step": 3970}, {"loss": 1.1719, "grad_norm": 1.1073139905929565, "learning_rate": 0.0002, "epoch": 5.335120643431635, "step": 3980}, {"loss": 1.2375, "grad_norm": 0.9269049763679504, "learning_rate": 0.0002, "epoch": 5.348525469168901, "step": 3990}, {"loss": 1.2513, "grad_norm": 0.9802212715148926, "learning_rate": 0.0002, "epoch": 5.361930294906166, "step": 4000}, {"loss": 1.1573, "grad_norm": 0.9152148365974426, "learning_rate": 0.0002, "epoch": 5.375335120643432, "step": 4010}, {"loss": 1.2673, "grad_norm": 1.0395890474319458, "learning_rate": 0.0002, "epoch": 5.388739946380697, "step": 4020}, {"loss": 1.2228, "grad_norm": 1.0989106893539429, "learning_rate": 0.0002, "epoch": 5.402144772117962, "step": 4030}, {"loss": 1.2717, "grad_norm": 1.0305225849151611, "learning_rate": 0.0002, "epoch": 5.415549597855228, "step": 4040}, {"loss": 1.2751, "grad_norm": 0.8416915535926819, "learning_rate": 0.0002, "epoch": 5.428954423592494, "step": 4050}, {"loss": 1.2205, "grad_norm": 0.9120758175849915, "learning_rate": 0.0002, "epoch": 5.442359249329758, "step": 4060}, {"loss": 1.2812, "grad_norm": 1.197936773300171, "learning_rate": 0.0002, "epoch": 5.455764075067024, "step": 4070}, {"loss": 1.2346, "grad_norm": 1.0116125345230103, "learning_rate": 0.0002, "epoch": 5.46916890080429, "step": 4080}, {"loss": 1.1746, "grad_norm": 1.048995018005371, "learning_rate": 0.0002, "epoch": 5.482573726541555, "step": 4090}, {"loss": 1.1858, "grad_norm": 0.929185152053833, "learning_rate": 0.0002, "epoch": 5.49597855227882, "step": 4100}, {"loss": 1.3068, "grad_norm": 0.9064884781837463, "learning_rate": 0.0002, "epoch": 5.509383378016086, "step": 4110}, {"loss": 1.2481, "grad_norm": 1.2009892463684082, "learning_rate": 0.0002, "epoch": 5.522788203753351, "step": 4120}, {"loss": 1.2788, "grad_norm": 0.9054455161094666, "learning_rate": 0.0002, "epoch": 5.536193029490617, "step": 4130}, {"loss": 1.1624, "grad_norm": 0.9978497624397278, "learning_rate": 0.0002, "epoch": 5.549597855227882, "step": 4140}, {"loss": 1.2814, "grad_norm": 0.9779615998268127, "learning_rate": 0.0002, "epoch": 5.563002680965147, "step": 4150}, {"loss": 1.2361, "grad_norm": 1.0515185594558716, "learning_rate": 0.0002, "epoch": 5.576407506702413, "step": 4160}, {"loss": 1.2278, "grad_norm": 0.8618236184120178, "learning_rate": 0.0002, "epoch": 5.589812332439678, "step": 4170}, {"loss": 1.2853, "grad_norm": 0.9569384455680847, "learning_rate": 0.0002, "epoch": 5.603217158176943, "step": 4180}, {"loss": 1.2824, "grad_norm": 0.968923807144165, "learning_rate": 0.0002, "epoch": 5.616621983914209, "step": 4190}, {"loss": 1.3055, "grad_norm": 0.8759993314743042, "learning_rate": 0.0002, "epoch": 5.630026809651475, "step": 4200}, {"loss": 1.2912, "grad_norm": 0.9284833669662476, "learning_rate": 0.0002, "epoch": 5.64343163538874, "step": 4210}, {"loss": 1.2886, "grad_norm": 0.9293071031570435, "learning_rate": 0.0002, "epoch": 5.656836461126005, "step": 4220}, {"loss": 1.2704, "grad_norm": 0.9872161149978638, "learning_rate": 0.0002, "epoch": 5.670241286863271, "step": 4230}, {"loss": 1.2525, "grad_norm": 0.9545941948890686, "learning_rate": 0.0002, "epoch": 5.683646112600536, "step": 4240}, {"loss": 1.2639, "grad_norm": 1.0202341079711914, "learning_rate": 0.0002, "epoch": 5.697050938337801, "step": 4250}, {"loss": 1.2259, "grad_norm": 0.9821504950523376, "learning_rate": 0.0002, "epoch": 5.710455764075067, "step": 4260}, {"loss": 1.2243, "grad_norm": 1.0581456422805786, "learning_rate": 0.0002, "epoch": 5.7238605898123325, "step": 4270}, {"loss": 1.227, "grad_norm": 0.9639395475387573, "learning_rate": 0.0002, "epoch": 5.737265415549598, "step": 4280}, {"loss": 1.2849, "grad_norm": 2.205458164215088, "learning_rate": 0.0002, "epoch": 5.750670241286863, "step": 4290}, {"loss": 1.2785, "grad_norm": 1.0294393301010132, "learning_rate": 0.0002, "epoch": 5.7640750670241285, "step": 4300}, {"loss": 1.261, "grad_norm": 1.0360256433486938, "learning_rate": 0.0002, "epoch": 5.777479892761394, "step": 4310}, {"loss": 1.2891, "grad_norm": 0.9390154480934143, "learning_rate": 0.0002, "epoch": 5.79088471849866, "step": 4320}, {"loss": 1.248, "grad_norm": 0.9048963189125061, "learning_rate": 0.0002, "epoch": 5.804289544235925, "step": 4330}, {"loss": 1.2753, "grad_norm": 0.9310713410377502, "learning_rate": 0.0002, "epoch": 5.81769436997319, "step": 4340}, {"loss": 1.2393, "grad_norm": 1.038282871246338, "learning_rate": 0.0002, "epoch": 5.831099195710456, "step": 4350}, {"loss": 1.3398, "grad_norm": 0.9194827079772949, "learning_rate": 0.0002, "epoch": 5.8445040214477215, "step": 4360}, {"loss": 1.3049, "grad_norm": 0.9568411111831665, "learning_rate": 0.0002, "epoch": 5.857908847184986, "step": 4370}, {"loss": 1.2899, "grad_norm": 0.9088910818099976, "learning_rate": 0.0002, "epoch": 5.871313672922252, "step": 4380}, {"loss": 1.2497, "grad_norm": 1.0605647563934326, "learning_rate": 0.0002, "epoch": 5.884718498659518, "step": 4390}, {"loss": 1.2387, "grad_norm": 0.8016388416290283, "learning_rate": 0.0002, "epoch": 5.898123324396783, "step": 4400}, {"loss": 1.3046, "grad_norm": 1.0792853832244873, "learning_rate": 0.0002, "epoch": 5.911528150134048, "step": 4410}, {"loss": 1.282, "grad_norm": 1.059403657913208, "learning_rate": 0.0002, "epoch": 5.924932975871314, "step": 4420}, {"loss": 1.2524, "grad_norm": 0.87492436170578, "learning_rate": 0.0002, "epoch": 5.938337801608579, "step": 4430}, {"loss": 1.2373, "grad_norm": 1.0911097526550293, "learning_rate": 0.0002, "epoch": 5.951742627345844, "step": 4440}, {"loss": 1.3073, "grad_norm": 0.8860997557640076, "learning_rate": 0.0002, "epoch": 5.96514745308311, "step": 4450}, {"loss": 1.3273, "grad_norm": 0.9176826477050781, "learning_rate": 0.0002, "epoch": 5.978552278820375, "step": 4460}, {"loss": 1.2725, "grad_norm": 0.9018680453300476, "learning_rate": 0.0002, "epoch": 5.991957104557641, "step": 4470}]} +{"epoch": 7.0, "step": 5222, "epoch_duration": 2043.2659237384796, "total_accumulated_duration": 14342.760690689087, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}, {"eval_loss": 1.8368606567382812, "eval_runtime": 90.5623, "eval_samples_per_second": 5.687, "eval_steps_per_second": 0.718, "epoch": 3.0, "step": 2238}, {"loss": 1.5704, "grad_norm": 0.40689945220947266, "learning_rate": 0.0002, "epoch": 3.002680965147453, "step": 2240}, {"loss": 1.5961, "grad_norm": 0.4699273705482483, "learning_rate": 0.0002, "epoch": 3.0160857908847185, "step": 2250}, {"loss": 1.5182, "grad_norm": 0.5531830787658691, "learning_rate": 0.0002, "epoch": 3.0294906166219837, "step": 2260}, {"loss": 1.4924, "grad_norm": 0.5441790223121643, "learning_rate": 0.0002, "epoch": 3.0428954423592494, "step": 2270}, {"loss": 1.4953, "grad_norm": 0.6145012974739075, "learning_rate": 0.0002, "epoch": 3.0563002680965146, "step": 2280}, {"loss": 1.4861, "grad_norm": 0.6997102499008179, "learning_rate": 0.0002, "epoch": 3.06970509383378, "step": 2290}, {"loss": 1.5853, "grad_norm": 0.6082330942153931, "learning_rate": 0.0002, "epoch": 3.0831099195710454, "step": 2300}, {"loss": 1.5377, "grad_norm": 0.5294155478477478, "learning_rate": 0.0002, "epoch": 3.096514745308311, "step": 2310}, {"loss": 1.5452, "grad_norm": 0.7200340032577515, "learning_rate": 0.0002, "epoch": 3.1099195710455763, "step": 2320}, {"loss": 1.5296, "grad_norm": 0.721092939376831, "learning_rate": 0.0002, "epoch": 3.123324396782842, "step": 2330}, {"loss": 1.5307, "grad_norm": 0.5344305038452148, "learning_rate": 0.0002, "epoch": 3.136729222520107, "step": 2340}, {"loss": 1.4347, "grad_norm": 0.5533145070075989, "learning_rate": 0.0002, "epoch": 3.1501340482573728, "step": 2350}, {"loss": 1.529, "grad_norm": 0.5976856350898743, "learning_rate": 0.0002, "epoch": 3.163538873994638, "step": 2360}, {"loss": 1.6044, "grad_norm": 0.4974960386753082, "learning_rate": 0.0002, "epoch": 3.1769436997319036, "step": 2370}, {"loss": 1.5554, "grad_norm": 0.6377840042114258, "learning_rate": 0.0002, "epoch": 3.190348525469169, "step": 2380}, {"loss": 1.5322, "grad_norm": 0.5447293519973755, "learning_rate": 0.0002, "epoch": 3.2037533512064345, "step": 2390}, {"loss": 1.5127, "grad_norm": 0.49577030539512634, "learning_rate": 0.0002, "epoch": 3.2171581769436997, "step": 2400}, {"loss": 1.4768, "grad_norm": 0.5588275790214539, "learning_rate": 0.0002, "epoch": 3.2305630026809653, "step": 2410}, {"loss": 1.4755, "grad_norm": 0.6429149508476257, "learning_rate": 0.0002, "epoch": 3.2439678284182305, "step": 2420}, {"loss": 1.5596, "grad_norm": 0.5713154673576355, "learning_rate": 0.0002, "epoch": 3.257372654155496, "step": 2430}, {"loss": 1.4763, "grad_norm": 0.6348955035209656, "learning_rate": 0.0002, "epoch": 3.2707774798927614, "step": 2440}, {"loss": 1.509, "grad_norm": 0.5675528645515442, "learning_rate": 0.0002, "epoch": 3.284182305630027, "step": 2450}, {"loss": 1.5867, "grad_norm": 0.5570188164710999, "learning_rate": 0.0002, "epoch": 3.297587131367292, "step": 2460}, {"loss": 1.554, "grad_norm": 0.6029602289199829, "learning_rate": 0.0002, "epoch": 3.310991957104558, "step": 2470}, {"loss": 1.5094, "grad_norm": 0.523206353187561, "learning_rate": 0.0002, "epoch": 3.324396782841823, "step": 2480}, {"loss": 1.4854, "grad_norm": 0.5912408828735352, "learning_rate": 0.0002, "epoch": 3.3378016085790883, "step": 2490}, {"loss": 1.5097, "grad_norm": 0.5524865984916687, "learning_rate": 0.0002, "epoch": 3.351206434316354, "step": 2500}, {"loss": 1.5064, "grad_norm": 0.60386061668396, "learning_rate": 0.0002, "epoch": 3.3646112600536195, "step": 2510}, {"loss": 1.564, "grad_norm": 0.5838595628738403, "learning_rate": 0.0002, "epoch": 3.3780160857908847, "step": 2520}, {"loss": 1.4615, "grad_norm": 0.5400974154472351, "learning_rate": 0.0002, "epoch": 3.39142091152815, "step": 2530}, {"loss": 1.5349, "grad_norm": 0.6150162220001221, "learning_rate": 0.0002, "epoch": 3.4048257372654156, "step": 2540}, {"loss": 1.5978, "grad_norm": 0.5279412269592285, "learning_rate": 0.0002, "epoch": 3.418230563002681, "step": 2550}, {"loss": 1.5063, "grad_norm": 0.5974063873291016, "learning_rate": 0.0002, "epoch": 3.4316353887399464, "step": 2560}, {"loss": 1.5825, "grad_norm": 0.661573052406311, "learning_rate": 0.0002, "epoch": 3.4450402144772116, "step": 2570}, {"loss": 1.5204, "grad_norm": 0.577880322933197, "learning_rate": 0.0002, "epoch": 3.4584450402144773, "step": 2580}, {"loss": 1.5295, "grad_norm": 0.5532318949699402, "learning_rate": 0.0002, "epoch": 3.4718498659517425, "step": 2590}, {"loss": 1.4933, "grad_norm": 0.5764921307563782, "learning_rate": 0.0002, "epoch": 3.485254691689008, "step": 2600}, {"loss": 1.4355, "grad_norm": 0.6145682334899902, "learning_rate": 0.0002, "epoch": 3.4986595174262733, "step": 2610}, {"loss": 1.4968, "grad_norm": 0.6561126112937927, "learning_rate": 0.0002, "epoch": 3.512064343163539, "step": 2620}, {"loss": 1.5309, "grad_norm": 0.5673288106918335, "learning_rate": 0.0002, "epoch": 3.525469168900804, "step": 2630}, {"loss": 1.5274, "grad_norm": 0.6215338706970215, "learning_rate": 0.0002, "epoch": 3.53887399463807, "step": 2640}, {"loss": 1.5117, "grad_norm": 0.5512040853500366, "learning_rate": 0.0002, "epoch": 3.552278820375335, "step": 2650}, {"loss": 1.5188, "grad_norm": 0.49503496289253235, "learning_rate": 0.0002, "epoch": 3.5656836461126007, "step": 2660}, {"loss": 1.524, "grad_norm": 0.5714912414550781, "learning_rate": 0.0002, "epoch": 3.579088471849866, "step": 2670}, {"loss": 1.4651, "grad_norm": 0.6883154511451721, "learning_rate": 0.0002, "epoch": 3.592493297587131, "step": 2680}, {"loss": 1.5174, "grad_norm": 0.5989556908607483, "learning_rate": 0.0002, "epoch": 3.6058981233243967, "step": 2690}, {"loss": 1.5335, "grad_norm": 0.630268394947052, "learning_rate": 0.0002, "epoch": 3.6193029490616624, "step": 2700}, {"loss": 1.4681, "grad_norm": 0.5819358229637146, "learning_rate": 0.0002, "epoch": 3.6327077747989276, "step": 2710}, {"loss": 1.5676, "grad_norm": 0.6102097034454346, "learning_rate": 0.0002, "epoch": 3.646112600536193, "step": 2720}, {"loss": 1.5566, "grad_norm": 0.6858501434326172, "learning_rate": 0.0002, "epoch": 3.6595174262734584, "step": 2730}, {"loss": 1.5242, "grad_norm": 0.6328608393669128, "learning_rate": 0.0002, "epoch": 3.672922252010724, "step": 2740}, {"loss": 1.5211, "grad_norm": 0.5366981029510498, "learning_rate": 0.0002, "epoch": 3.6863270777479893, "step": 2750}, {"loss": 1.5532, "grad_norm": 0.7048938274383545, "learning_rate": 0.0002, "epoch": 3.6997319034852545, "step": 2760}, {"loss": 1.5001, "grad_norm": 0.5371938347816467, "learning_rate": 0.0002, "epoch": 3.71313672922252, "step": 2770}, {"loss": 1.557, "grad_norm": 0.6142212152481079, "learning_rate": 0.0002, "epoch": 3.726541554959786, "step": 2780}, {"loss": 1.5191, "grad_norm": 0.6164522171020508, "learning_rate": 0.0002, "epoch": 3.739946380697051, "step": 2790}, {"loss": 1.5071, "grad_norm": 0.7511836886405945, "learning_rate": 0.0002, "epoch": 3.753351206434316, "step": 2800}, {"loss": 1.5775, "grad_norm": 0.6194717288017273, "learning_rate": 0.0002, "epoch": 3.766756032171582, "step": 2810}, {"loss": 1.5721, "grad_norm": 0.676721453666687, "learning_rate": 0.0002, "epoch": 3.780160857908847, "step": 2820}, {"loss": 1.502, "grad_norm": 0.5646911263465881, "learning_rate": 0.0002, "epoch": 3.7935656836461127, "step": 2830}, {"loss": 1.4871, "grad_norm": 0.5874826908111572, "learning_rate": 0.0002, "epoch": 3.806970509383378, "step": 2840}, {"loss": 1.5046, "grad_norm": 0.6395232677459717, "learning_rate": 0.0002, "epoch": 3.8203753351206435, "step": 2850}, {"loss": 1.5088, "grad_norm": 0.624563992023468, "learning_rate": 0.0002, "epoch": 3.8337801608579087, "step": 2860}, {"loss": 1.479, "grad_norm": 0.59019935131073, "learning_rate": 0.0002, "epoch": 3.8471849865951744, "step": 2870}, {"loss": 1.4693, "grad_norm": 0.6700479984283447, "learning_rate": 0.0002, "epoch": 3.8605898123324396, "step": 2880}, {"loss": 1.5032, "grad_norm": 0.6131282448768616, "learning_rate": 0.0002, "epoch": 3.8739946380697052, "step": 2890}, {"loss": 1.5446, "grad_norm": 0.6807777881622314, "learning_rate": 0.0002, "epoch": 3.8873994638069704, "step": 2900}, {"loss": 1.5618, "grad_norm": 0.5297217965126038, "learning_rate": 0.0002, "epoch": 3.900804289544236, "step": 2910}, {"loss": 1.5046, "grad_norm": 0.5795540809631348, "learning_rate": 0.0002, "epoch": 3.9142091152815013, "step": 2920}, {"loss": 1.5155, "grad_norm": 0.5549747347831726, "learning_rate": 0.0002, "epoch": 3.927613941018767, "step": 2930}, {"loss": 1.5932, "grad_norm": 0.5895092487335205, "learning_rate": 0.0002, "epoch": 3.941018766756032, "step": 2940}, {"loss": 1.5831, "grad_norm": 0.590002715587616, "learning_rate": 0.0002, "epoch": 3.9544235924932973, "step": 2950}, {"loss": 1.592, "grad_norm": 0.7847695350646973, "learning_rate": 0.0002, "epoch": 3.967828418230563, "step": 2960}, {"loss": 1.4892, "grad_norm": 0.5845848321914673, "learning_rate": 0.0002, "epoch": 3.9812332439678286, "step": 2970}, {"loss": 1.5094, "grad_norm": 0.5861571431159973, "learning_rate": 0.0002, "epoch": 3.994638069705094, "step": 2980}, {"eval_loss": 1.8821998834609985, "eval_runtime": 90.8701, "eval_samples_per_second": 5.667, "eval_steps_per_second": 0.715, "epoch": 4.0, "step": 2984}, {"loss": 1.4156, "grad_norm": 0.6209918260574341, "learning_rate": 0.0002, "epoch": 4.008042895442359, "step": 2990}, {"loss": 1.4244, "grad_norm": 0.607226550579071, "learning_rate": 0.0002, "epoch": 4.021447721179625, "step": 3000}, {"loss": 1.3652, "grad_norm": 0.6677961349487305, "learning_rate": 0.0002, "epoch": 4.03485254691689, "step": 3010}, {"loss": 1.3815, "grad_norm": 0.9053248763084412, "learning_rate": 0.0002, "epoch": 4.048257372654155, "step": 3020}, {"loss": 1.4346, "grad_norm": 0.6815084218978882, "learning_rate": 0.0002, "epoch": 4.061662198391421, "step": 3030}, {"loss": 1.3, "grad_norm": 0.6709407567977905, "learning_rate": 0.0002, "epoch": 4.075067024128686, "step": 3040}, {"loss": 1.3406, "grad_norm": 0.728184163570404, "learning_rate": 0.0002, "epoch": 4.088471849865952, "step": 3050}, {"loss": 1.3404, "grad_norm": 0.817628800868988, "learning_rate": 0.0002, "epoch": 4.101876675603217, "step": 3060}, {"loss": 1.3496, "grad_norm": 0.7384206056594849, "learning_rate": 0.0002, "epoch": 4.115281501340482, "step": 3070}, {"loss": 1.3621, "grad_norm": 0.7380280494689941, "learning_rate": 0.0002, "epoch": 4.128686327077748, "step": 3080}, {"loss": 1.3425, "grad_norm": 0.8197277188301086, "learning_rate": 0.0002, "epoch": 4.142091152815014, "step": 3090}, {"loss": 1.3761, "grad_norm": 0.8971617817878723, "learning_rate": 0.0002, "epoch": 4.1554959785522785, "step": 3100}, {"loss": 1.3564, "grad_norm": 0.7409387826919556, "learning_rate": 0.0002, "epoch": 4.168900804289544, "step": 3110}, {"loss": 1.3675, "grad_norm": 0.6948909163475037, "learning_rate": 0.0002, "epoch": 4.18230563002681, "step": 3120}, {"loss": 1.3397, "grad_norm": 0.7619595527648926, "learning_rate": 0.0002, "epoch": 4.195710455764075, "step": 3130}, {"loss": 1.3864, "grad_norm": 0.7657106518745422, "learning_rate": 0.0002, "epoch": 4.20911528150134, "step": 3140}, {"loss": 1.4017, "grad_norm": 0.6919401288032532, "learning_rate": 0.0002, "epoch": 4.222520107238606, "step": 3150}, {"loss": 1.3692, "grad_norm": 0.6991415023803711, "learning_rate": 0.0002, "epoch": 4.2359249329758715, "step": 3160}, {"loss": 1.3651, "grad_norm": 0.7349252700805664, "learning_rate": 0.0002, "epoch": 4.249329758713137, "step": 3170}, {"loss": 1.367, "grad_norm": 0.8838240504264832, "learning_rate": 0.0002, "epoch": 4.262734584450402, "step": 3180}, {"loss": 1.4254, "grad_norm": 0.7240107655525208, "learning_rate": 0.0002, "epoch": 4.2761394101876675, "step": 3190}, {"loss": 1.3671, "grad_norm": 0.7338636517524719, "learning_rate": 0.0002, "epoch": 4.289544235924933, "step": 3200}, {"loss": 1.448, "grad_norm": 0.7891436815261841, "learning_rate": 0.0002, "epoch": 4.302949061662199, "step": 3210}, {"loss": 1.3291, "grad_norm": 0.7407845854759216, "learning_rate": 0.0002, "epoch": 4.316353887399464, "step": 3220}, {"loss": 1.3899, "grad_norm": 0.7635948061943054, "learning_rate": 0.0002, "epoch": 4.329758713136729, "step": 3230}, {"loss": 1.3384, "grad_norm": 0.7478461861610413, "learning_rate": 0.0002, "epoch": 4.343163538873995, "step": 3240}, {"loss": 1.388, "grad_norm": 0.7684298157691956, "learning_rate": 0.0002, "epoch": 4.35656836461126, "step": 3250}, {"loss": 1.4233, "grad_norm": 1.0287525653839111, "learning_rate": 0.0002, "epoch": 4.369973190348525, "step": 3260}, {"loss": 1.3542, "grad_norm": 0.750616192817688, "learning_rate": 0.0002, "epoch": 4.383378016085791, "step": 3270}, {"loss": 1.3158, "grad_norm": 0.7911648750305176, "learning_rate": 0.0002, "epoch": 4.396782841823057, "step": 3280}, {"loss": 1.3896, "grad_norm": 0.9156750440597534, "learning_rate": 0.0002, "epoch": 4.410187667560321, "step": 3290}, {"loss": 1.3887, "grad_norm": 1.0180249214172363, "learning_rate": 0.0002, "epoch": 4.423592493297587, "step": 3300}, {"loss": 1.4143, "grad_norm": 1.0792218446731567, "learning_rate": 0.0002, "epoch": 4.436997319034853, "step": 3310}, {"loss": 1.3314, "grad_norm": 0.8027488589286804, "learning_rate": 0.0002, "epoch": 4.450402144772118, "step": 3320}, {"loss": 1.4144, "grad_norm": 0.8037815093994141, "learning_rate": 0.0002, "epoch": 4.463806970509383, "step": 3330}, {"loss": 1.4124, "grad_norm": 0.7907946705818176, "learning_rate": 0.0002, "epoch": 4.477211796246649, "step": 3340}, {"loss": 1.443, "grad_norm": 0.7206302881240845, "learning_rate": 0.0002, "epoch": 4.490616621983914, "step": 3350}, {"loss": 1.3822, "grad_norm": 0.7697674632072449, "learning_rate": 0.0002, "epoch": 4.50402144772118, "step": 3360}, {"loss": 1.3923, "grad_norm": 0.7315130829811096, "learning_rate": 0.0002, "epoch": 4.517426273458445, "step": 3370}, {"loss": 1.3598, "grad_norm": 0.7896273136138916, "learning_rate": 0.0002, "epoch": 4.53083109919571, "step": 3380}, {"loss": 1.3947, "grad_norm": 0.7720345258712769, "learning_rate": 0.0002, "epoch": 4.544235924932976, "step": 3390}, {"loss": 1.404, "grad_norm": 0.8304631114006042, "learning_rate": 0.0002, "epoch": 4.557640750670242, "step": 3400}, {"loss": 1.3712, "grad_norm": 0.7408214211463928, "learning_rate": 0.0002, "epoch": 4.571045576407506, "step": 3410}, {"loss": 1.3957, "grad_norm": 0.8100157976150513, "learning_rate": 0.0002, "epoch": 4.584450402144772, "step": 3420}, {"loss": 1.47, "grad_norm": 0.7829574942588806, "learning_rate": 0.0002, "epoch": 4.597855227882038, "step": 3430}, {"loss": 1.3684, "grad_norm": 0.9529728889465332, "learning_rate": 0.0002, "epoch": 4.6112600536193025, "step": 3440}, {"loss": 1.3984, "grad_norm": 1.0769460201263428, "learning_rate": 0.0002, "epoch": 4.624664879356568, "step": 3450}, {"loss": 1.4063, "grad_norm": 0.8941947817802429, "learning_rate": 0.0002, "epoch": 4.638069705093834, "step": 3460}, {"loss": 1.4421, "grad_norm": 0.7860096096992493, "learning_rate": 0.0002, "epoch": 4.651474530831099, "step": 3470}, {"loss": 1.3782, "grad_norm": 0.8184044361114502, "learning_rate": 0.0002, "epoch": 4.664879356568365, "step": 3480}, {"loss": 1.3885, "grad_norm": 0.7852717638015747, "learning_rate": 0.0002, "epoch": 4.67828418230563, "step": 3490}, {"loss": 1.4139, "grad_norm": 0.750586986541748, "learning_rate": 0.0002, "epoch": 4.6916890080428955, "step": 3500}, {"loss": 1.3224, "grad_norm": 0.7966068983078003, "learning_rate": 0.0002, "epoch": 4.705093833780161, "step": 3510}, {"loss": 1.4052, "grad_norm": 0.8387030959129333, "learning_rate": 0.0002, "epoch": 4.718498659517426, "step": 3520}, {"loss": 1.4541, "grad_norm": 0.7373180389404297, "learning_rate": 0.0002, "epoch": 4.7319034852546915, "step": 3530}, {"loss": 1.4148, "grad_norm": 0.8415353894233704, "learning_rate": 0.0002, "epoch": 4.745308310991957, "step": 3540}, {"loss": 1.4236, "grad_norm": 0.7155488133430481, "learning_rate": 0.0002, "epoch": 4.758713136729223, "step": 3550}, {"loss": 1.3454, "grad_norm": 0.697658896446228, "learning_rate": 0.0002, "epoch": 4.772117962466488, "step": 3560}, {"loss": 1.4002, "grad_norm": 0.8722999095916748, "learning_rate": 0.0002, "epoch": 4.785522788203753, "step": 3570}, {"loss": 1.4224, "grad_norm": 0.8106381297111511, "learning_rate": 0.0002, "epoch": 4.798927613941019, "step": 3580}, {"loss": 1.3525, "grad_norm": 0.9320500493049622, "learning_rate": 0.0002, "epoch": 4.8123324396782845, "step": 3590}, {"loss": 1.3675, "grad_norm": 0.7583016157150269, "learning_rate": 0.0002, "epoch": 4.825737265415549, "step": 3600}, {"loss": 1.3761, "grad_norm": 0.790050208568573, "learning_rate": 0.0002, "epoch": 4.839142091152815, "step": 3610}, {"loss": 1.4144, "grad_norm": 0.7481580972671509, "learning_rate": 0.0002, "epoch": 4.8525469168900806, "step": 3620}, {"loss": 1.4424, "grad_norm": 0.8709374666213989, "learning_rate": 0.0002, "epoch": 4.865951742627346, "step": 3630}, {"loss": 1.3758, "grad_norm": 0.7266733050346375, "learning_rate": 0.0002, "epoch": 4.879356568364611, "step": 3640}, {"loss": 1.4254, "grad_norm": 0.7669504880905151, "learning_rate": 0.0002, "epoch": 4.892761394101877, "step": 3650}, {"loss": 1.3956, "grad_norm": 0.7855764627456665, "learning_rate": 0.0002, "epoch": 4.906166219839142, "step": 3660}, {"loss": 1.4609, "grad_norm": 0.8145440816879272, "learning_rate": 0.0002, "epoch": 4.919571045576408, "step": 3670}, {"loss": 1.4152, "grad_norm": 0.7487278580665588, "learning_rate": 0.0002, "epoch": 4.932975871313673, "step": 3680}, {"loss": 1.4386, "grad_norm": 0.8390981554985046, "learning_rate": 0.0002, "epoch": 4.946380697050938, "step": 3690}, {"loss": 1.3504, "grad_norm": 0.663752555847168, "learning_rate": 0.0002, "epoch": 4.959785522788204, "step": 3700}, {"loss": 1.3453, "grad_norm": 0.7821969985961914, "learning_rate": 0.0002, "epoch": 4.973190348525469, "step": 3710}, {"loss": 1.3936, "grad_norm": 0.9157266020774841, "learning_rate": 0.0002, "epoch": 4.986595174262734, "step": 3720}, {"loss": 1.3925, "grad_norm": 0.7683535814285278, "learning_rate": 0.0002, "epoch": 5.0, "step": 3730}, {"eval_loss": 1.9639414548873901, "eval_runtime": 92.0173, "eval_samples_per_second": 5.597, "eval_steps_per_second": 0.706, "epoch": 5.0, "step": 3730}, {"loss": 1.1852, "grad_norm": 1.3000373840332031, "learning_rate": 0.0002, "epoch": 5.013404825737266, "step": 3740}, {"loss": 1.1922, "grad_norm": 0.8916982412338257, "learning_rate": 0.0002, "epoch": 5.02680965147453, "step": 3750}, {"loss": 1.2113, "grad_norm": 1.0365116596221924, "learning_rate": 0.0002, "epoch": 5.040214477211796, "step": 3760}, {"loss": 1.2941, "grad_norm": 0.999420166015625, "learning_rate": 0.0002, "epoch": 5.053619302949062, "step": 3770}, {"loss": 1.24, "grad_norm": 1.093572974205017, "learning_rate": 0.0002, "epoch": 5.067024128686327, "step": 3780}, {"loss": 1.2345, "grad_norm": 1.1137515306472778, "learning_rate": 0.0002, "epoch": 5.080428954423592, "step": 3790}, {"loss": 1.1646, "grad_norm": 1.0328283309936523, "learning_rate": 0.0002, "epoch": 5.093833780160858, "step": 3800}, {"loss": 1.1716, "grad_norm": 1.0444108247756958, "learning_rate": 0.0002, "epoch": 5.107238605898123, "step": 3810}, {"loss": 1.2226, "grad_norm": 0.858148992061615, "learning_rate": 0.0002, "epoch": 5.120643431635389, "step": 3820}, {"loss": 1.1691, "grad_norm": 0.94026780128479, "learning_rate": 0.0002, "epoch": 5.134048257372654, "step": 3830}, {"loss": 1.1902, "grad_norm": 0.8987152576446533, "learning_rate": 0.0002, "epoch": 5.1474530831099194, "step": 3840}, {"loss": 1.1562, "grad_norm": 0.922997236251831, "learning_rate": 0.0002, "epoch": 5.160857908847185, "step": 3850}, {"loss": 1.2072, "grad_norm": 0.9172422289848328, "learning_rate": 0.0002, "epoch": 5.174262734584451, "step": 3860}, {"loss": 1.1802, "grad_norm": 1.02277672290802, "learning_rate": 0.0002, "epoch": 5.1876675603217155, "step": 3870}, {"loss": 1.2206, "grad_norm": 1.093826413154602, "learning_rate": 0.0002, "epoch": 5.201072386058981, "step": 3880}, {"loss": 1.2578, "grad_norm": 0.9362447261810303, "learning_rate": 0.0002, "epoch": 5.214477211796247, "step": 3890}, {"loss": 1.2335, "grad_norm": 1.0564044713974, "learning_rate": 0.0002, "epoch": 5.227882037533512, "step": 3900}, {"loss": 1.1936, "grad_norm": 0.869575023651123, "learning_rate": 0.0002, "epoch": 5.241286863270777, "step": 3910}, {"loss": 1.2301, "grad_norm": 1.0383203029632568, "learning_rate": 0.0002, "epoch": 5.254691689008043, "step": 3920}, {"loss": 1.2076, "grad_norm": 0.9146919846534729, "learning_rate": 0.0002, "epoch": 5.2680965147453085, "step": 3930}, {"loss": 1.2804, "grad_norm": 0.9226430654525757, "learning_rate": 0.0002, "epoch": 5.281501340482574, "step": 3940}, {"loss": 1.2506, "grad_norm": 0.8703194260597229, "learning_rate": 0.0002, "epoch": 5.294906166219839, "step": 3950}, {"loss": 1.2533, "grad_norm": 1.0588284730911255, "learning_rate": 0.0002, "epoch": 5.3083109919571045, "step": 3960}, {"loss": 1.2405, "grad_norm": 1.1131688356399536, "learning_rate": 0.0002, "epoch": 5.32171581769437, "step": 3970}, {"loss": 1.1719, "grad_norm": 1.1073139905929565, "learning_rate": 0.0002, "epoch": 5.335120643431635, "step": 3980}, {"loss": 1.2375, "grad_norm": 0.9269049763679504, "learning_rate": 0.0002, "epoch": 5.348525469168901, "step": 3990}, {"loss": 1.2513, "grad_norm": 0.9802212715148926, "learning_rate": 0.0002, "epoch": 5.361930294906166, "step": 4000}, {"loss": 1.1573, "grad_norm": 0.9152148365974426, "learning_rate": 0.0002, "epoch": 5.375335120643432, "step": 4010}, {"loss": 1.2673, "grad_norm": 1.0395890474319458, "learning_rate": 0.0002, "epoch": 5.388739946380697, "step": 4020}, {"loss": 1.2228, "grad_norm": 1.0989106893539429, "learning_rate": 0.0002, "epoch": 5.402144772117962, "step": 4030}, {"loss": 1.2717, "grad_norm": 1.0305225849151611, "learning_rate": 0.0002, "epoch": 5.415549597855228, "step": 4040}, {"loss": 1.2751, "grad_norm": 0.8416915535926819, "learning_rate": 0.0002, "epoch": 5.428954423592494, "step": 4050}, {"loss": 1.2205, "grad_norm": 0.9120758175849915, "learning_rate": 0.0002, "epoch": 5.442359249329758, "step": 4060}, {"loss": 1.2812, "grad_norm": 1.197936773300171, "learning_rate": 0.0002, "epoch": 5.455764075067024, "step": 4070}, {"loss": 1.2346, "grad_norm": 1.0116125345230103, "learning_rate": 0.0002, "epoch": 5.46916890080429, "step": 4080}, {"loss": 1.1746, "grad_norm": 1.048995018005371, "learning_rate": 0.0002, "epoch": 5.482573726541555, "step": 4090}, {"loss": 1.1858, "grad_norm": 0.929185152053833, "learning_rate": 0.0002, "epoch": 5.49597855227882, "step": 4100}, {"loss": 1.3068, "grad_norm": 0.9064884781837463, "learning_rate": 0.0002, "epoch": 5.509383378016086, "step": 4110}, {"loss": 1.2481, "grad_norm": 1.2009892463684082, "learning_rate": 0.0002, "epoch": 5.522788203753351, "step": 4120}, {"loss": 1.2788, "grad_norm": 0.9054455161094666, "learning_rate": 0.0002, "epoch": 5.536193029490617, "step": 4130}, {"loss": 1.1624, "grad_norm": 0.9978497624397278, "learning_rate": 0.0002, "epoch": 5.549597855227882, "step": 4140}, {"loss": 1.2814, "grad_norm": 0.9779615998268127, "learning_rate": 0.0002, "epoch": 5.563002680965147, "step": 4150}, {"loss": 1.2361, "grad_norm": 1.0515185594558716, "learning_rate": 0.0002, "epoch": 5.576407506702413, "step": 4160}, {"loss": 1.2278, "grad_norm": 0.8618236184120178, "learning_rate": 0.0002, "epoch": 5.589812332439678, "step": 4170}, {"loss": 1.2853, "grad_norm": 0.9569384455680847, "learning_rate": 0.0002, "epoch": 5.603217158176943, "step": 4180}, {"loss": 1.2824, "grad_norm": 0.968923807144165, "learning_rate": 0.0002, "epoch": 5.616621983914209, "step": 4190}, {"loss": 1.3055, "grad_norm": 0.8759993314743042, "learning_rate": 0.0002, "epoch": 5.630026809651475, "step": 4200}, {"loss": 1.2912, "grad_norm": 0.9284833669662476, "learning_rate": 0.0002, "epoch": 5.64343163538874, "step": 4210}, {"loss": 1.2886, "grad_norm": 0.9293071031570435, "learning_rate": 0.0002, "epoch": 5.656836461126005, "step": 4220}, {"loss": 1.2704, "grad_norm": 0.9872161149978638, "learning_rate": 0.0002, "epoch": 5.670241286863271, "step": 4230}, {"loss": 1.2525, "grad_norm": 0.9545941948890686, "learning_rate": 0.0002, "epoch": 5.683646112600536, "step": 4240}, {"loss": 1.2639, "grad_norm": 1.0202341079711914, "learning_rate": 0.0002, "epoch": 5.697050938337801, "step": 4250}, {"loss": 1.2259, "grad_norm": 0.9821504950523376, "learning_rate": 0.0002, "epoch": 5.710455764075067, "step": 4260}, {"loss": 1.2243, "grad_norm": 1.0581456422805786, "learning_rate": 0.0002, "epoch": 5.7238605898123325, "step": 4270}, {"loss": 1.227, "grad_norm": 0.9639395475387573, "learning_rate": 0.0002, "epoch": 5.737265415549598, "step": 4280}, {"loss": 1.2849, "grad_norm": 2.205458164215088, "learning_rate": 0.0002, "epoch": 5.750670241286863, "step": 4290}, {"loss": 1.2785, "grad_norm": 1.0294393301010132, "learning_rate": 0.0002, "epoch": 5.7640750670241285, "step": 4300}, {"loss": 1.261, "grad_norm": 1.0360256433486938, "learning_rate": 0.0002, "epoch": 5.777479892761394, "step": 4310}, {"loss": 1.2891, "grad_norm": 0.9390154480934143, "learning_rate": 0.0002, "epoch": 5.79088471849866, "step": 4320}, {"loss": 1.248, "grad_norm": 0.9048963189125061, "learning_rate": 0.0002, "epoch": 5.804289544235925, "step": 4330}, {"loss": 1.2753, "grad_norm": 0.9310713410377502, "learning_rate": 0.0002, "epoch": 5.81769436997319, "step": 4340}, {"loss": 1.2393, "grad_norm": 1.038282871246338, "learning_rate": 0.0002, "epoch": 5.831099195710456, "step": 4350}, {"loss": 1.3398, "grad_norm": 0.9194827079772949, "learning_rate": 0.0002, "epoch": 5.8445040214477215, "step": 4360}, {"loss": 1.3049, "grad_norm": 0.9568411111831665, "learning_rate": 0.0002, "epoch": 5.857908847184986, "step": 4370}, {"loss": 1.2899, "grad_norm": 0.9088910818099976, "learning_rate": 0.0002, "epoch": 5.871313672922252, "step": 4380}, {"loss": 1.2497, "grad_norm": 1.0605647563934326, "learning_rate": 0.0002, "epoch": 5.884718498659518, "step": 4390}, {"loss": 1.2387, "grad_norm": 0.8016388416290283, "learning_rate": 0.0002, "epoch": 5.898123324396783, "step": 4400}, {"loss": 1.3046, "grad_norm": 1.0792853832244873, "learning_rate": 0.0002, "epoch": 5.911528150134048, "step": 4410}, {"loss": 1.282, "grad_norm": 1.059403657913208, "learning_rate": 0.0002, "epoch": 5.924932975871314, "step": 4420}, {"loss": 1.2524, "grad_norm": 0.87492436170578, "learning_rate": 0.0002, "epoch": 5.938337801608579, "step": 4430}, {"loss": 1.2373, "grad_norm": 1.0911097526550293, "learning_rate": 0.0002, "epoch": 5.951742627345844, "step": 4440}, {"loss": 1.3073, "grad_norm": 0.8860997557640076, "learning_rate": 0.0002, "epoch": 5.96514745308311, "step": 4450}, {"loss": 1.3273, "grad_norm": 0.9176826477050781, "learning_rate": 0.0002, "epoch": 5.978552278820375, "step": 4460}, {"loss": 1.2725, "grad_norm": 0.9018680453300476, "learning_rate": 0.0002, "epoch": 5.991957104557641, "step": 4470}, {"eval_loss": 2.0600433349609375, "eval_runtime": 92.2728, "eval_samples_per_second": 5.581, "eval_steps_per_second": 0.704, "epoch": 6.0, "step": 4476}, {"loss": 1.2019, "grad_norm": 0.8612148761749268, "learning_rate": 0.0002, "epoch": 6.005361930294906, "step": 4480}, {"loss": 1.1005, "grad_norm": 1.170229434967041, "learning_rate": 0.0002, "epoch": 6.018766756032171, "step": 4490}, {"loss": 1.0129, "grad_norm": 1.1005233526229858, "learning_rate": 0.0002, "epoch": 6.032171581769437, "step": 4500}, {"loss": 1.0936, "grad_norm": 1.1763442754745483, "learning_rate": 0.0002, "epoch": 6.045576407506703, "step": 4510}, {"loss": 0.9865, "grad_norm": 1.0595353841781616, "learning_rate": 0.0002, "epoch": 6.058981233243967, "step": 4520}, {"loss": 0.9543, "grad_norm": 1.3554084300994873, "learning_rate": 0.0002, "epoch": 6.072386058981233, "step": 4530}, {"loss": 1.0619, "grad_norm": 1.238821268081665, "learning_rate": 0.0002, "epoch": 6.085790884718499, "step": 4540}, {"loss": 1.0951, "grad_norm": 1.0496071577072144, "learning_rate": 0.0002, "epoch": 6.099195710455764, "step": 4550}, {"loss": 1.1128, "grad_norm": 1.3410215377807617, "learning_rate": 0.0002, "epoch": 6.112600536193029, "step": 4560}, {"loss": 1.0824, "grad_norm": 1.2559033632278442, "learning_rate": 0.0002, "epoch": 6.126005361930295, "step": 4570}, {"loss": 1.0645, "grad_norm": 1.2556545734405518, "learning_rate": 0.0002, "epoch": 6.13941018766756, "step": 4580}, {"loss": 1.1219, "grad_norm": 1.050678014755249, "learning_rate": 0.0002, "epoch": 6.152815013404826, "step": 4590}, {"loss": 1.0421, "grad_norm": 1.566770076751709, "learning_rate": 0.0002, "epoch": 6.166219839142091, "step": 4600}, {"loss": 1.0617, "grad_norm": 1.1482226848602295, "learning_rate": 0.0002, "epoch": 6.1796246648793565, "step": 4610}, {"loss": 1.0477, "grad_norm": 1.2731150388717651, "learning_rate": 0.0002, "epoch": 6.193029490616622, "step": 4620}, {"loss": 1.0291, "grad_norm": 1.4135994911193848, "learning_rate": 0.0002, "epoch": 6.206434316353888, "step": 4630}, {"loss": 1.0666, "grad_norm": 1.2925093173980713, "learning_rate": 0.0002, "epoch": 6.2198391420911525, "step": 4640}, {"loss": 1.0657, "grad_norm": 1.1199861764907837, "learning_rate": 0.0002, "epoch": 6.233243967828418, "step": 4650}, {"loss": 1.1143, "grad_norm": 1.2010078430175781, "learning_rate": 0.0002, "epoch": 6.246648793565684, "step": 4660}, {"loss": 1.1186, "grad_norm": 1.2655692100524902, "learning_rate": 0.0002, "epoch": 6.2600536193029495, "step": 4670}, {"loss": 1.0276, "grad_norm": 1.0960880517959595, "learning_rate": 0.0002, "epoch": 6.273458445040214, "step": 4680}, {"loss": 1.0576, "grad_norm": 1.170759916305542, "learning_rate": 0.0002, "epoch": 6.28686327077748, "step": 4690}, {"loss": 1.0852, "grad_norm": 1.1199755668640137, "learning_rate": 0.0002, "epoch": 6.3002680965147455, "step": 4700}, {"loss": 1.0171, "grad_norm": 1.1477710008621216, "learning_rate": 0.0002, "epoch": 6.31367292225201, "step": 4710}, {"loss": 1.0411, "grad_norm": 1.0862090587615967, "learning_rate": 0.0002, "epoch": 6.327077747989276, "step": 4720}, {"loss": 1.0299, "grad_norm": 1.1428112983703613, "learning_rate": 0.0002, "epoch": 6.340482573726542, "step": 4730}, {"loss": 1.0988, "grad_norm": 1.155534029006958, "learning_rate": 0.0002, "epoch": 6.353887399463807, "step": 4740}, {"loss": 1.1134, "grad_norm": 1.2997788190841675, "learning_rate": 0.0002, "epoch": 6.367292225201073, "step": 4750}, {"loss": 1.1386, "grad_norm": 1.1087043285369873, "learning_rate": 0.0002, "epoch": 6.380697050938338, "step": 4760}, {"loss": 1.0266, "grad_norm": 1.3957210779190063, "learning_rate": 0.0002, "epoch": 6.394101876675603, "step": 4770}, {"loss": 1.0803, "grad_norm": 1.1346395015716553, "learning_rate": 0.0002, "epoch": 6.407506702412869, "step": 4780}, {"loss": 1.0686, "grad_norm": 1.3830486536026, "learning_rate": 0.0002, "epoch": 6.420911528150134, "step": 4790}, {"loss": 1.138, "grad_norm": 1.1137559413909912, "learning_rate": 0.0002, "epoch": 6.434316353887399, "step": 4800}, {"loss": 1.0863, "grad_norm": 1.151821494102478, "learning_rate": 0.0002, "epoch": 6.447721179624665, "step": 4810}, {"loss": 1.0821, "grad_norm": 1.122589111328125, "learning_rate": 0.0002, "epoch": 6.461126005361931, "step": 4820}, {"loss": 1.1308, "grad_norm": 1.2847239971160889, "learning_rate": 0.0002, "epoch": 6.474530831099195, "step": 4830}, {"loss": 1.1001, "grad_norm": 1.027617335319519, "learning_rate": 0.0002, "epoch": 6.487935656836461, "step": 4840}, {"loss": 1.102, "grad_norm": 1.3375194072723389, "learning_rate": 0.0002, "epoch": 6.501340482573727, "step": 4850}, {"loss": 1.1055, "grad_norm": 1.1723220348358154, "learning_rate": 0.0002, "epoch": 6.514745308310992, "step": 4860}, {"loss": 1.129, "grad_norm": 1.7034224271774292, "learning_rate": 0.0002, "epoch": 6.528150134048257, "step": 4870}, {"loss": 1.0544, "grad_norm": 1.0840927362442017, "learning_rate": 0.0002, "epoch": 6.541554959785523, "step": 4880}, {"loss": 1.1194, "grad_norm": 1.3088481426239014, "learning_rate": 0.0002, "epoch": 6.554959785522788, "step": 4890}, {"loss": 1.1513, "grad_norm": 1.1394107341766357, "learning_rate": 0.0002, "epoch": 6.568364611260054, "step": 4900}, {"loss": 1.0796, "grad_norm": 1.0243184566497803, "learning_rate": 0.0002, "epoch": 6.581769436997319, "step": 4910}, {"loss": 1.2096, "grad_norm": 1.0814571380615234, "learning_rate": 0.0002, "epoch": 6.595174262734584, "step": 4920}, {"loss": 1.1279, "grad_norm": 1.1652323007583618, "learning_rate": 0.0002, "epoch": 6.60857908847185, "step": 4930}, {"loss": 1.186, "grad_norm": 1.0203579664230347, "learning_rate": 0.0002, "epoch": 6.621983914209116, "step": 4940}, {"loss": 1.1243, "grad_norm": 1.3823212385177612, "learning_rate": 0.0002, "epoch": 6.6353887399463805, "step": 4950}, {"loss": 1.1464, "grad_norm": 1.248955488204956, "learning_rate": 0.0002, "epoch": 6.648793565683646, "step": 4960}, {"loss": 1.1278, "grad_norm": 1.2215739488601685, "learning_rate": 0.0002, "epoch": 6.662198391420912, "step": 4970}, {"loss": 1.1109, "grad_norm": 1.307869553565979, "learning_rate": 0.0002, "epoch": 6.6756032171581765, "step": 4980}, {"loss": 1.1738, "grad_norm": 1.4434916973114014, "learning_rate": 0.0002, "epoch": 6.689008042895442, "step": 4990}, {"loss": 1.1068, "grad_norm": 1.1840227842330933, "learning_rate": 0.0002, "epoch": 6.702412868632708, "step": 5000}, {"loss": 1.1738, "grad_norm": 1.1775435209274292, "learning_rate": 0.0002, "epoch": 6.7158176943699734, "step": 5010}, {"loss": 1.114, "grad_norm": 1.1639968156814575, "learning_rate": 0.0002, "epoch": 6.729222520107239, "step": 5020}, {"loss": 1.1363, "grad_norm": 1.3774648904800415, "learning_rate": 0.0002, "epoch": 6.742627345844504, "step": 5030}, {"loss": 1.095, "grad_norm": 1.0328693389892578, "learning_rate": 0.0002, "epoch": 6.7560321715817695, "step": 5040}, {"loss": 1.1371, "grad_norm": 1.0495599508285522, "learning_rate": 0.0002, "epoch": 6.769436997319035, "step": 5050}, {"loss": 1.1728, "grad_norm": 1.3220133781433105, "learning_rate": 0.0002, "epoch": 6.7828418230563, "step": 5060}, {"loss": 1.13, "grad_norm": 1.3658279180526733, "learning_rate": 0.0002, "epoch": 6.7962466487935655, "step": 5070}, {"loss": 1.0755, "grad_norm": 1.3788504600524902, "learning_rate": 0.0002, "epoch": 6.809651474530831, "step": 5080}, {"loss": 1.1331, "grad_norm": 1.2342770099639893, "learning_rate": 0.0002, "epoch": 6.823056300268097, "step": 5090}, {"loss": 1.1761, "grad_norm": 1.3752578496932983, "learning_rate": 0.0002, "epoch": 6.836461126005362, "step": 5100}, {"loss": 1.078, "grad_norm": 1.0902243852615356, "learning_rate": 0.0002, "epoch": 6.849865951742627, "step": 5110}, {"loss": 1.1613, "grad_norm": 1.2125890254974365, "learning_rate": 0.0002, "epoch": 6.863270777479893, "step": 5120}, {"loss": 1.1651, "grad_norm": 1.2979270219802856, "learning_rate": 0.0002, "epoch": 6.8766756032171585, "step": 5130}, {"loss": 1.1207, "grad_norm": 1.2894749641418457, "learning_rate": 0.0002, "epoch": 6.890080428954423, "step": 5140}, {"loss": 1.1143, "grad_norm": 1.4804800748825073, "learning_rate": 0.0002, "epoch": 6.903485254691689, "step": 5150}, {"loss": 1.1245, "grad_norm": 1.1119170188903809, "learning_rate": 0.0002, "epoch": 6.916890080428955, "step": 5160}, {"loss": 1.1135, "grad_norm": 1.4991406202316284, "learning_rate": 0.0002, "epoch": 6.930294906166219, "step": 5170}, {"loss": 1.1025, "grad_norm": 1.2187672853469849, "learning_rate": 0.0002, "epoch": 6.943699731903485, "step": 5180}, {"loss": 1.1991, "grad_norm": 1.2419520616531372, "learning_rate": 0.0002, "epoch": 6.957104557640751, "step": 5190}, {"loss": 1.1231, "grad_norm": 1.359859585762024, "learning_rate": 0.0002, "epoch": 6.970509383378016, "step": 5200}, {"loss": 1.0882, "grad_norm": 1.3679486513137817, "learning_rate": 0.0002, "epoch": 6.983914209115282, "step": 5210}, {"loss": 1.1856, "grad_norm": 1.2109483480453491, "learning_rate": 0.0002, "epoch": 6.997319034852547, "step": 5220}]} +{"epoch": 8.0, "step": 5968, "epoch_duration": 2117.4552652835846, "total_accumulated_duration": 16460.21595597267, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75439453125}, "avg_memory_reserved": {"GPU_0": 14256.0}, "peak_memory_reserved": {"GPU_0": 15414.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-3514-sd-4/checkpoint-1492", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.5866, "grad_norm": 0.5006060004234314, "learning_rate": 0.0002, "epoch": 0.013404825737265416, "step": 10}, {"loss": 2.2758, "grad_norm": 0.895697832107544, "learning_rate": 0.0002, "epoch": 0.02680965147453083, "step": 20}, {"loss": 2.1106, "grad_norm": 0.4904654324054718, "learning_rate": 0.0002, "epoch": 0.040214477211796246, "step": 30}, {"loss": 1.9964, "grad_norm": 0.5587937831878662, "learning_rate": 0.0002, "epoch": 0.05361930294906166, "step": 40}, {"loss": 1.9997, "grad_norm": 0.46309754252433777, "learning_rate": 0.0002, "epoch": 0.06702412868632708, "step": 50}, {"loss": 1.9512, "grad_norm": 0.46663302183151245, "learning_rate": 0.0002, "epoch": 0.08042895442359249, "step": 60}, {"loss": 1.845, "grad_norm": 0.6435502171516418, "learning_rate": 0.0002, "epoch": 0.0938337801608579, "step": 70}, {"loss": 1.8528, "grad_norm": 0.46288377046585083, "learning_rate": 0.0002, "epoch": 0.10723860589812333, "step": 80}, {"loss": 1.8332, "grad_norm": 0.5226837396621704, "learning_rate": 0.0002, "epoch": 0.12064343163538874, "step": 90}, {"loss": 1.8706, "grad_norm": 1.190576195716858, "learning_rate": 0.0002, "epoch": 0.13404825737265416, "step": 100}, {"loss": 1.8465, "grad_norm": 0.4229426980018616, "learning_rate": 0.0002, "epoch": 0.14745308310991956, "step": 110}, {"loss": 1.8933, "grad_norm": 0.7448789477348328, "learning_rate": 0.0002, "epoch": 0.16085790884718498, "step": 120}, {"loss": 1.8377, "grad_norm": 0.3955472409725189, "learning_rate": 0.0002, "epoch": 0.1742627345844504, "step": 130}, {"loss": 1.8731, "grad_norm": 0.4333747327327728, "learning_rate": 0.0002, "epoch": 0.1876675603217158, "step": 140}, {"loss": 1.9102, "grad_norm": 0.4262531101703644, "learning_rate": 0.0002, "epoch": 0.20107238605898123, "step": 150}, {"loss": 1.8525, "grad_norm": 0.44875991344451904, "learning_rate": 0.0002, "epoch": 0.21447721179624665, "step": 160}, {"loss": 1.8104, "grad_norm": 0.39748692512512207, "learning_rate": 0.0002, "epoch": 0.22788203753351208, "step": 170}, {"loss": 1.8956, "grad_norm": 0.3995216488838196, "learning_rate": 0.0002, "epoch": 0.24128686327077747, "step": 180}, {"loss": 1.8166, "grad_norm": 0.4942905902862549, "learning_rate": 0.0002, "epoch": 0.2546916890080429, "step": 190}, {"loss": 1.8784, "grad_norm": 0.5456372499465942, "learning_rate": 0.0002, "epoch": 0.2680965147453083, "step": 200}, {"loss": 1.8204, "grad_norm": 0.42792096734046936, "learning_rate": 0.0002, "epoch": 0.28150134048257375, "step": 210}, {"loss": 1.8034, "grad_norm": 0.5114870667457581, "learning_rate": 0.0002, "epoch": 0.2949061662198391, "step": 220}, {"loss": 1.7965, "grad_norm": 0.41311749815940857, "learning_rate": 0.0002, "epoch": 0.30831099195710454, "step": 230}, {"loss": 1.8193, "grad_norm": 0.39651045203208923, "learning_rate": 0.0002, "epoch": 0.32171581769436997, "step": 240}, {"loss": 1.8806, "grad_norm": 0.3648274540901184, "learning_rate": 0.0002, "epoch": 0.3351206434316354, "step": 250}, {"loss": 1.7645, "grad_norm": 0.3815963566303253, "learning_rate": 0.0002, "epoch": 0.3485254691689008, "step": 260}, {"loss": 1.8385, "grad_norm": 0.4006984531879425, "learning_rate": 0.0002, "epoch": 0.36193029490616624, "step": 270}, {"loss": 1.8459, "grad_norm": 0.4043481647968292, "learning_rate": 0.0002, "epoch": 0.3753351206434316, "step": 280}, {"loss": 1.8551, "grad_norm": 0.37889420986175537, "learning_rate": 0.0002, "epoch": 0.38873994638069703, "step": 290}, {"loss": 1.8094, "grad_norm": 0.34378889203071594, "learning_rate": 0.0002, "epoch": 0.40214477211796246, "step": 300}, {"loss": 1.7489, "grad_norm": 0.3695462644100189, "learning_rate": 0.0002, "epoch": 0.4155495978552279, "step": 310}, {"loss": 1.7838, "grad_norm": 0.3820156753063202, "learning_rate": 0.0002, "epoch": 0.4289544235924933, "step": 320}, {"loss": 1.8432, "grad_norm": 0.4782438576221466, "learning_rate": 0.0002, "epoch": 0.44235924932975873, "step": 330}, {"loss": 1.8114, "grad_norm": 0.34293901920318604, "learning_rate": 0.0002, "epoch": 0.45576407506702415, "step": 340}, {"loss": 1.8255, "grad_norm": 0.34477704763412476, "learning_rate": 0.0002, "epoch": 0.4691689008042895, "step": 350}, {"loss": 1.7518, "grad_norm": 0.372482031583786, "learning_rate": 0.0002, "epoch": 0.48257372654155495, "step": 360}, {"loss": 1.7949, "grad_norm": 0.37152206897735596, "learning_rate": 0.0002, "epoch": 0.4959785522788204, "step": 370}, {"loss": 1.8622, "grad_norm": 0.3464239537715912, "learning_rate": 0.0002, "epoch": 0.5093833780160858, "step": 380}, {"loss": 1.7986, "grad_norm": 0.3936820328235626, "learning_rate": 0.0002, "epoch": 0.5227882037533512, "step": 390}, {"loss": 1.8422, "grad_norm": 0.4001905620098114, "learning_rate": 0.0002, "epoch": 0.5361930294906166, "step": 400}, {"loss": 1.889, "grad_norm": 0.3600618243217468, "learning_rate": 0.0002, "epoch": 0.5495978552278821, "step": 410}, {"loss": 1.7667, "grad_norm": 0.3735682964324951, "learning_rate": 0.0002, "epoch": 0.5630026809651475, "step": 420}, {"loss": 1.8039, "grad_norm": 0.34881851077079773, "learning_rate": 0.0002, "epoch": 0.5764075067024129, "step": 430}, {"loss": 1.8438, "grad_norm": 0.3512067496776581, "learning_rate": 0.0002, "epoch": 0.5898123324396782, "step": 440}, {"loss": 1.8021, "grad_norm": 0.42287155985832214, "learning_rate": 0.0002, "epoch": 0.6032171581769437, "step": 450}, {"loss": 1.8818, "grad_norm": 0.34132200479507446, "learning_rate": 0.0002, "epoch": 0.6166219839142091, "step": 460}, {"loss": 1.7515, "grad_norm": 0.345334529876709, "learning_rate": 0.0002, "epoch": 0.6300268096514745, "step": 470}, {"loss": 1.8632, "grad_norm": 0.363789826631546, "learning_rate": 0.0002, "epoch": 0.6434316353887399, "step": 480}, {"loss": 1.7783, "grad_norm": 0.33300429582595825, "learning_rate": 0.0002, "epoch": 0.6568364611260054, "step": 490}, {"loss": 1.8464, "grad_norm": 0.4159756600856781, "learning_rate": 0.0002, "epoch": 0.6702412868632708, "step": 500}, {"loss": 1.8082, "grad_norm": 0.3246348798274994, "learning_rate": 0.0002, "epoch": 0.6836461126005362, "step": 510}, {"loss": 1.8568, "grad_norm": 0.3838692307472229, "learning_rate": 0.0002, "epoch": 0.6970509383378016, "step": 520}, {"loss": 1.8308, "grad_norm": 0.3381868898868561, "learning_rate": 0.0002, "epoch": 0.710455764075067, "step": 530}, {"loss": 1.8174, "grad_norm": 0.34136253595352173, "learning_rate": 0.0002, "epoch": 0.7238605898123325, "step": 540}, {"loss": 1.7902, "grad_norm": 0.3476671576499939, "learning_rate": 0.0002, "epoch": 0.7372654155495979, "step": 550}, {"loss": 1.792, "grad_norm": 0.35285887122154236, "learning_rate": 0.0002, "epoch": 0.7506702412868632, "step": 560}, {"loss": 1.8588, "grad_norm": 0.3596920371055603, "learning_rate": 0.0002, "epoch": 0.7640750670241286, "step": 570}, {"loss": 1.8762, "grad_norm": 0.32715895771980286, "learning_rate": 0.0002, "epoch": 0.7774798927613941, "step": 580}, {"loss": 1.7703, "grad_norm": 0.34543490409851074, "learning_rate": 0.0002, "epoch": 0.7908847184986595, "step": 590}, {"loss": 1.747, "grad_norm": 0.37439998984336853, "learning_rate": 0.0002, "epoch": 0.8042895442359249, "step": 600}, {"loss": 1.8243, "grad_norm": 0.3491382300853729, "learning_rate": 0.0002, "epoch": 0.8176943699731903, "step": 610}, {"loss": 1.8925, "grad_norm": 0.34014254808425903, "learning_rate": 0.0002, "epoch": 0.8310991957104558, "step": 620}, {"loss": 1.7386, "grad_norm": 0.3297452926635742, "learning_rate": 0.0002, "epoch": 0.8445040214477212, "step": 630}, {"loss": 1.7946, "grad_norm": 0.3458525538444519, "learning_rate": 0.0002, "epoch": 0.8579088471849866, "step": 640}, {"loss": 1.7439, "grad_norm": 0.3545733392238617, "learning_rate": 0.0002, "epoch": 0.871313672922252, "step": 650}, {"loss": 1.7753, "grad_norm": 0.3864935040473938, "learning_rate": 0.0002, "epoch": 0.8847184986595175, "step": 660}, {"loss": 1.9012, "grad_norm": 0.35447531938552856, "learning_rate": 0.0002, "epoch": 0.8981233243967829, "step": 670}, {"loss": 1.8019, "grad_norm": 0.32028648257255554, "learning_rate": 0.0002, "epoch": 0.9115281501340483, "step": 680}, {"loss": 1.7813, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "epoch": 0.9249329758713136, "step": 690}, {"loss": 1.704, "grad_norm": 0.3581075072288513, "learning_rate": 0.0002, "epoch": 0.938337801608579, "step": 700}, {"loss": 1.7897, "grad_norm": 0.3576897978782654, "learning_rate": 0.0002, "epoch": 0.9517426273458445, "step": 710}, {"loss": 1.7086, "grad_norm": 0.33551549911499023, "learning_rate": 0.0002, "epoch": 0.9651474530831099, "step": 720}, {"loss": 1.6907, "grad_norm": 0.39297860860824585, "learning_rate": 0.0002, "epoch": 0.9785522788203753, "step": 730}, {"loss": 1.7941, "grad_norm": 0.3467773199081421, "learning_rate": 0.0002, "epoch": 0.9919571045576407, "step": 740}, {"eval_loss": 1.8168668746948242, "eval_runtime": 90.6336, "eval_samples_per_second": 5.682, "eval_steps_per_second": 0.717, "epoch": 1.0, "step": 746}, {"loss": 1.7741, "grad_norm": 0.2998153269290924, "learning_rate": 0.0002, "epoch": 1.0053619302949062, "step": 750}, {"loss": 1.7897, "grad_norm": 0.34353747963905334, "learning_rate": 0.0002, "epoch": 1.0187667560321716, "step": 760}, {"loss": 1.6997, "grad_norm": 0.3506847321987152, "learning_rate": 0.0002, "epoch": 1.032171581769437, "step": 770}, {"loss": 1.7277, "grad_norm": 0.3434218764305115, "learning_rate": 0.0002, "epoch": 1.0455764075067024, "step": 780}, {"loss": 1.7201, "grad_norm": 0.39283573627471924, "learning_rate": 0.0002, "epoch": 1.0589812332439679, "step": 790}, {"loss": 1.7134, "grad_norm": 0.36534103751182556, "learning_rate": 0.0002, "epoch": 1.0723860589812333, "step": 800}, {"loss": 1.73, "grad_norm": 0.32713210582733154, "learning_rate": 0.0002, "epoch": 1.0857908847184987, "step": 810}, {"loss": 1.733, "grad_norm": 0.4298870861530304, "learning_rate": 0.0002, "epoch": 1.0991957104557641, "step": 820}, {"loss": 1.7152, "grad_norm": 0.3652895987033844, "learning_rate": 0.0002, "epoch": 1.1126005361930296, "step": 830}, {"loss": 1.7952, "grad_norm": 0.4341593086719513, "learning_rate": 0.0002, "epoch": 1.126005361930295, "step": 840}, {"loss": 1.7353, "grad_norm": 0.3925093412399292, "learning_rate": 0.0002, "epoch": 1.1394101876675604, "step": 850}, {"loss": 1.7484, "grad_norm": 0.3695056736469269, "learning_rate": 0.0002, "epoch": 1.1528150134048256, "step": 860}, {"loss": 1.7959, "grad_norm": 0.36138468980789185, "learning_rate": 0.0002, "epoch": 1.1662198391420913, "step": 870}, {"loss": 1.7144, "grad_norm": 0.33074072003364563, "learning_rate": 0.0002, "epoch": 1.1796246648793565, "step": 880}, {"loss": 1.7303, "grad_norm": 0.3552579879760742, "learning_rate": 0.0002, "epoch": 1.193029490616622, "step": 890}, {"loss": 1.6857, "grad_norm": 0.38744238018989563, "learning_rate": 0.0002, "epoch": 1.2064343163538873, "step": 900}, {"loss": 1.7543, "grad_norm": 0.3563305735588074, "learning_rate": 0.0002, "epoch": 1.2198391420911527, "step": 910}, {"loss": 1.7406, "grad_norm": 0.35686084628105164, "learning_rate": 0.0002, "epoch": 1.2332439678284182, "step": 920}, {"loss": 1.765, "grad_norm": 0.4001927077770233, "learning_rate": 0.0002, "epoch": 1.2466487935656836, "step": 930}, {"loss": 1.7147, "grad_norm": 0.35909149050712585, "learning_rate": 0.0002, "epoch": 1.260053619302949, "step": 940}, {"loss": 1.6712, "grad_norm": 0.35123375058174133, "learning_rate": 0.0002, "epoch": 1.2734584450402144, "step": 950}, {"loss": 1.7245, "grad_norm": 0.38013333082199097, "learning_rate": 0.0002, "epoch": 1.2868632707774799, "step": 960}, {"loss": 1.7395, "grad_norm": 0.373146653175354, "learning_rate": 0.0002, "epoch": 1.3002680965147453, "step": 970}, {"loss": 1.707, "grad_norm": 0.4208183288574219, "learning_rate": 0.0002, "epoch": 1.3136729222520107, "step": 980}, {"loss": 1.7122, "grad_norm": 0.3613564074039459, "learning_rate": 0.0002, "epoch": 1.3270777479892761, "step": 990}, {"loss": 1.6776, "grad_norm": 0.34058499336242676, "learning_rate": 0.0002, "epoch": 1.3404825737265416, "step": 1000}, {"loss": 1.7072, "grad_norm": 0.3563075065612793, "learning_rate": 0.0002, "epoch": 1.353887399463807, "step": 1010}, {"loss": 1.7167, "grad_norm": 0.36920854449272156, "learning_rate": 0.0002, "epoch": 1.3672922252010724, "step": 1020}, {"loss": 1.7143, "grad_norm": 0.3889519274234772, "learning_rate": 0.0002, "epoch": 1.3806970509383378, "step": 1030}, {"loss": 1.8023, "grad_norm": 0.3664555251598358, "learning_rate": 0.0002, "epoch": 1.3941018766756033, "step": 1040}, {"loss": 1.7961, "grad_norm": 0.38175567984580994, "learning_rate": 0.0002, "epoch": 1.4075067024128687, "step": 1050}, {"loss": 1.7363, "grad_norm": 0.42346763610839844, "learning_rate": 0.0002, "epoch": 1.420911528150134, "step": 1060}, {"loss": 1.708, "grad_norm": 0.3456033170223236, "learning_rate": 0.0002, "epoch": 1.4343163538873995, "step": 1070}, {"loss": 1.6846, "grad_norm": 0.38931941986083984, "learning_rate": 0.0002, "epoch": 1.447721179624665, "step": 1080}, {"loss": 1.7416, "grad_norm": 0.5473279356956482, "learning_rate": 0.0002, "epoch": 1.4611260053619302, "step": 1090}, {"loss": 1.6927, "grad_norm": 0.3517422676086426, "learning_rate": 0.0002, "epoch": 1.4745308310991958, "step": 1100}, {"loss": 1.7213, "grad_norm": 0.3511943221092224, "learning_rate": 0.0002, "epoch": 1.487935656836461, "step": 1110}, {"loss": 1.7947, "grad_norm": 0.3762837052345276, "learning_rate": 0.0002, "epoch": 1.5013404825737267, "step": 1120}, {"loss": 1.6893, "grad_norm": 0.37149128317832947, "learning_rate": 0.0002, "epoch": 1.5147453083109919, "step": 1130}, {"loss": 1.6944, "grad_norm": 0.3945842981338501, "learning_rate": 0.0002, "epoch": 1.5281501340482575, "step": 1140}, {"loss": 1.7254, "grad_norm": 0.40258195996284485, "learning_rate": 0.0002, "epoch": 1.5415549597855227, "step": 1150}, {"loss": 1.6798, "grad_norm": 0.3959120213985443, "learning_rate": 0.0002, "epoch": 1.5549597855227884, "step": 1160}, {"loss": 1.7789, "grad_norm": 0.37792712450027466, "learning_rate": 0.0002, "epoch": 1.5683646112600536, "step": 1170}, {"loss": 1.7953, "grad_norm": 0.4019201099872589, "learning_rate": 0.0002, "epoch": 1.5817694369973192, "step": 1180}, {"loss": 1.6887, "grad_norm": 0.40712273120880127, "learning_rate": 0.0002, "epoch": 1.5951742627345844, "step": 1190}, {"loss": 1.7131, "grad_norm": 0.4131423234939575, "learning_rate": 0.0002, "epoch": 1.6085790884718498, "step": 1200}, {"loss": 1.6757, "grad_norm": 0.3738194704055786, "learning_rate": 0.0002, "epoch": 1.6219839142091153, "step": 1210}, {"loss": 1.7629, "grad_norm": 0.3987765908241272, "learning_rate": 0.0002, "epoch": 1.6353887399463807, "step": 1220}, {"loss": 1.7374, "grad_norm": 0.34117406606674194, "learning_rate": 0.0002, "epoch": 1.648793565683646, "step": 1230}, {"loss": 1.7869, "grad_norm": 0.34900516271591187, "learning_rate": 0.0002, "epoch": 1.6621983914209115, "step": 1240}, {"loss": 1.7162, "grad_norm": 0.35759788751602173, "learning_rate": 0.0002, "epoch": 1.675603217158177, "step": 1250}, {"loss": 1.7697, "grad_norm": 0.3837822377681732, "learning_rate": 0.0002, "epoch": 1.6890080428954424, "step": 1260}, {"loss": 1.7972, "grad_norm": 0.3671180307865143, "learning_rate": 0.0002, "epoch": 1.7024128686327078, "step": 1270}, {"loss": 1.7198, "grad_norm": 0.4124658703804016, "learning_rate": 0.0002, "epoch": 1.7158176943699732, "step": 1280}, {"loss": 1.8006, "grad_norm": 0.39059901237487793, "learning_rate": 0.0002, "epoch": 1.7292225201072386, "step": 1290}, {"loss": 1.7721, "grad_norm": 0.4006287157535553, "learning_rate": 0.0002, "epoch": 1.742627345844504, "step": 1300}, {"loss": 1.8196, "grad_norm": 0.3606216013431549, "learning_rate": 0.0002, "epoch": 1.7560321715817695, "step": 1310}, {"loss": 1.7213, "grad_norm": 0.3861924111843109, "learning_rate": 0.0002, "epoch": 1.7694369973190347, "step": 1320}, {"loss": 1.7849, "grad_norm": 0.41432589292526245, "learning_rate": 0.0002, "epoch": 1.7828418230563003, "step": 1330}, {"loss": 1.7069, "grad_norm": 0.3751705586910248, "learning_rate": 0.0002, "epoch": 1.7962466487935655, "step": 1340}, {"loss": 1.717, "grad_norm": 0.36217355728149414, "learning_rate": 0.0002, "epoch": 1.8096514745308312, "step": 1350}, {"loss": 1.7878, "grad_norm": 0.35937434434890747, "learning_rate": 0.0002, "epoch": 1.8230563002680964, "step": 1360}, {"loss": 1.7026, "grad_norm": 0.36120304465293884, "learning_rate": 0.0002, "epoch": 1.836461126005362, "step": 1370}, {"loss": 1.7378, "grad_norm": 0.36082401871681213, "learning_rate": 0.0002, "epoch": 1.8498659517426272, "step": 1380}, {"loss": 1.6938, "grad_norm": 0.3616413176059723, "learning_rate": 0.0002, "epoch": 1.863270777479893, "step": 1390}, {"loss": 1.6998, "grad_norm": 0.3664911091327667, "learning_rate": 0.0002, "epoch": 1.876675603217158, "step": 1400}, {"loss": 1.7548, "grad_norm": 0.3545122444629669, "learning_rate": 0.0002, "epoch": 1.8900804289544237, "step": 1410}, {"loss": 1.727, "grad_norm": 0.38186976313591003, "learning_rate": 0.0002, "epoch": 1.903485254691689, "step": 1420}, {"loss": 1.788, "grad_norm": 0.41099944710731506, "learning_rate": 0.0002, "epoch": 1.9168900804289544, "step": 1430}, {"loss": 1.7377, "grad_norm": 0.34538620710372925, "learning_rate": 0.0002, "epoch": 1.9302949061662198, "step": 1440}, {"loss": 1.7349, "grad_norm": 0.35443663597106934, "learning_rate": 0.0002, "epoch": 1.9436997319034852, "step": 1450}, {"loss": 1.7457, "grad_norm": 0.4783519208431244, "learning_rate": 0.0002, "epoch": 1.9571045576407506, "step": 1460}, {"loss": 1.7073, "grad_norm": 0.36285310983657837, "learning_rate": 0.0002, "epoch": 1.970509383378016, "step": 1470}, {"loss": 1.7607, "grad_norm": 0.361730694770813, "learning_rate": 0.0002, "epoch": 1.9839142091152815, "step": 1480}, {"loss": 1.7133, "grad_norm": 0.38347867131233215, "learning_rate": 0.0002, "epoch": 1.997319034852547, "step": 1490}, {"eval_loss": 1.8150336742401123, "eval_runtime": 91.1797, "eval_samples_per_second": 5.648, "eval_steps_per_second": 0.713, "epoch": 2.0, "step": 1492}, {"loss": 1.6673, "grad_norm": 0.3648935854434967, "learning_rate": 0.0002, "epoch": 2.0107238605898123, "step": 1500}, {"loss": 1.6754, "grad_norm": 0.3521469533443451, "learning_rate": 0.0002, "epoch": 2.0241286863270775, "step": 1510}, {"loss": 1.5775, "grad_norm": 0.4275520145893097, "learning_rate": 0.0002, "epoch": 2.037533512064343, "step": 1520}, {"loss": 1.5932, "grad_norm": 0.4140888750553131, "learning_rate": 0.0002, "epoch": 2.0509383378016084, "step": 1530}, {"loss": 1.6237, "grad_norm": 0.37715452909469604, "learning_rate": 0.0002, "epoch": 2.064343163538874, "step": 1540}, {"loss": 1.6426, "grad_norm": 0.4375513195991516, "learning_rate": 0.0002, "epoch": 2.0777479892761392, "step": 1550}, {"loss": 1.6675, "grad_norm": 0.44963088631629944, "learning_rate": 0.0002, "epoch": 2.091152815013405, "step": 1560}, {"loss": 1.6731, "grad_norm": 0.45463916659355164, "learning_rate": 0.0002, "epoch": 2.10455764075067, "step": 1570}, {"loss": 1.5928, "grad_norm": 0.3952806293964386, "learning_rate": 0.0002, "epoch": 2.1179624664879357, "step": 1580}, {"loss": 1.6153, "grad_norm": 0.44873616099357605, "learning_rate": 0.0002, "epoch": 2.131367292225201, "step": 1590}, {"loss": 1.5953, "grad_norm": 0.45529067516326904, "learning_rate": 0.0002, "epoch": 2.1447721179624666, "step": 1600}, {"loss": 1.634, "grad_norm": 0.4483625590801239, "learning_rate": 0.0002, "epoch": 2.158176943699732, "step": 1610}, {"loss": 1.6202, "grad_norm": 0.3954690992832184, "learning_rate": 0.0002, "epoch": 2.1715817694369974, "step": 1620}, {"loss": 1.6657, "grad_norm": 0.4297006130218506, "learning_rate": 0.0002, "epoch": 2.1849865951742626, "step": 1630}, {"loss": 1.5499, "grad_norm": 0.4121869206428528, "learning_rate": 0.0002, "epoch": 2.1983914209115283, "step": 1640}, {"loss": 1.6017, "grad_norm": 0.45843517780303955, "learning_rate": 0.0002, "epoch": 2.2117962466487935, "step": 1650}, {"loss": 1.6699, "grad_norm": 0.44742295145988464, "learning_rate": 0.0002, "epoch": 2.225201072386059, "step": 1660}, {"loss": 1.6879, "grad_norm": 0.500198483467102, "learning_rate": 0.0002, "epoch": 2.2386058981233243, "step": 1670}, {"loss": 1.6362, "grad_norm": 0.4322265386581421, "learning_rate": 0.0002, "epoch": 2.25201072386059, "step": 1680}, {"loss": 1.6486, "grad_norm": 0.480289101600647, "learning_rate": 0.0002, "epoch": 2.265415549597855, "step": 1690}, {"loss": 1.6396, "grad_norm": 0.4532500207424164, "learning_rate": 0.0002, "epoch": 2.278820375335121, "step": 1700}, {"loss": 1.6088, "grad_norm": 0.41848474740982056, "learning_rate": 0.0002, "epoch": 2.292225201072386, "step": 1710}, {"loss": 1.6447, "grad_norm": 0.47211962938308716, "learning_rate": 0.0002, "epoch": 2.3056300268096512, "step": 1720}, {"loss": 1.7174, "grad_norm": 0.4273032248020172, "learning_rate": 0.0002, "epoch": 2.319034852546917, "step": 1730}, {"loss": 1.617, "grad_norm": 0.4660373330116272, "learning_rate": 0.0002, "epoch": 2.3324396782841825, "step": 1740}, {"loss": 1.6036, "grad_norm": 0.4409862756729126, "learning_rate": 0.0002, "epoch": 2.3458445040214477, "step": 1750}, {"loss": 1.6579, "grad_norm": 0.44795849919319153, "learning_rate": 0.0002, "epoch": 2.359249329758713, "step": 1760}, {"loss": 1.5736, "grad_norm": 0.4470100402832031, "learning_rate": 0.0002, "epoch": 2.3726541554959786, "step": 1770}, {"loss": 1.6277, "grad_norm": 0.4184521436691284, "learning_rate": 0.0002, "epoch": 2.386058981233244, "step": 1780}, {"loss": 1.6654, "grad_norm": 0.4572308659553528, "learning_rate": 0.0002, "epoch": 2.3994638069705094, "step": 1790}, {"loss": 1.6714, "grad_norm": 0.4888782501220703, "learning_rate": 0.0002, "epoch": 2.4128686327077746, "step": 1800}, {"loss": 1.7168, "grad_norm": 0.4442083239555359, "learning_rate": 0.0002, "epoch": 2.4262734584450403, "step": 1810}, {"loss": 1.6375, "grad_norm": 0.4986329972743988, "learning_rate": 0.0002, "epoch": 2.4396782841823055, "step": 1820}, {"loss": 1.6881, "grad_norm": 0.47918054461479187, "learning_rate": 0.0002, "epoch": 2.453083109919571, "step": 1830}, {"loss": 1.5969, "grad_norm": 0.42569679021835327, "learning_rate": 0.0002, "epoch": 2.4664879356568363, "step": 1840}, {"loss": 1.5751, "grad_norm": 0.4683821201324463, "learning_rate": 0.0002, "epoch": 2.479892761394102, "step": 1850}, {"loss": 1.6004, "grad_norm": 0.43605074286460876, "learning_rate": 0.0002, "epoch": 2.493297587131367, "step": 1860}, {"loss": 1.6885, "grad_norm": 0.4189167618751526, "learning_rate": 0.0002, "epoch": 2.506702412868633, "step": 1870}, {"loss": 1.6493, "grad_norm": 0.5860861539840698, "learning_rate": 0.0002, "epoch": 2.520107238605898, "step": 1880}, {"loss": 1.6563, "grad_norm": 0.4568740427494049, "learning_rate": 0.0002, "epoch": 2.5335120643431637, "step": 1890}, {"loss": 1.6653, "grad_norm": 0.4672846496105194, "learning_rate": 0.0002, "epoch": 2.546916890080429, "step": 1900}, {"loss": 1.6037, "grad_norm": 0.4280472993850708, "learning_rate": 0.0002, "epoch": 2.5603217158176945, "step": 1910}, {"loss": 1.5721, "grad_norm": 0.590728759765625, "learning_rate": 0.0002, "epoch": 2.5737265415549597, "step": 1920}, {"loss": 1.6567, "grad_norm": 0.4205126166343689, "learning_rate": 0.0002, "epoch": 2.5871313672922254, "step": 1930}, {"loss": 1.5045, "grad_norm": 0.47869905829429626, "learning_rate": 0.0002, "epoch": 2.6005361930294906, "step": 1940}, {"loss": 1.5973, "grad_norm": 0.4607323408126831, "learning_rate": 0.0002, "epoch": 2.6139410187667558, "step": 1950}, {"loss": 1.644, "grad_norm": 0.4762210547924042, "learning_rate": 0.0002, "epoch": 2.6273458445040214, "step": 1960}, {"loss": 1.6316, "grad_norm": 0.46832647919654846, "learning_rate": 0.0002, "epoch": 2.640750670241287, "step": 1970}, {"loss": 1.6591, "grad_norm": 0.4368574619293213, "learning_rate": 0.0002, "epoch": 2.6541554959785523, "step": 1980}, {"loss": 1.6359, "grad_norm": 0.5248273611068726, "learning_rate": 0.0002, "epoch": 2.6675603217158175, "step": 1990}, {"loss": 1.6879, "grad_norm": 0.46777117252349854, "learning_rate": 0.0002, "epoch": 2.680965147453083, "step": 2000}, {"loss": 1.7248, "grad_norm": 0.5201858878135681, "learning_rate": 0.0002, "epoch": 2.6943699731903488, "step": 2010}, {"loss": 1.6337, "grad_norm": 0.46777284145355225, "learning_rate": 0.0002, "epoch": 2.707774798927614, "step": 2020}, {"loss": 1.6369, "grad_norm": 0.46736642718315125, "learning_rate": 0.0002, "epoch": 2.721179624664879, "step": 2030}, {"loss": 1.6356, "grad_norm": 0.4647925794124603, "learning_rate": 0.0002, "epoch": 2.734584450402145, "step": 2040}, {"loss": 1.732, "grad_norm": 0.4298803508281708, "learning_rate": 0.0002, "epoch": 2.7479892761394105, "step": 2050}, {"loss": 1.6648, "grad_norm": 0.45485609769821167, "learning_rate": 0.0002, "epoch": 2.7613941018766757, "step": 2060}, {"loss": 1.6706, "grad_norm": 0.43687865138053894, "learning_rate": 0.0002, "epoch": 2.774798927613941, "step": 2070}, {"loss": 1.6904, "grad_norm": 0.4319164752960205, "learning_rate": 0.0002, "epoch": 2.7882037533512065, "step": 2080}, {"loss": 1.6531, "grad_norm": 0.47792428731918335, "learning_rate": 0.0002, "epoch": 2.8016085790884717, "step": 2090}, {"loss": 1.6417, "grad_norm": 0.5322234034538269, "learning_rate": 0.0002, "epoch": 2.8150134048257374, "step": 2100}, {"loss": 1.6634, "grad_norm": 0.47517943382263184, "learning_rate": 0.0002, "epoch": 2.8284182305630026, "step": 2110}, {"loss": 1.6329, "grad_norm": 0.45799025893211365, "learning_rate": 0.0002, "epoch": 2.841823056300268, "step": 2120}, {"loss": 1.6594, "grad_norm": 0.45852357149124146, "learning_rate": 0.0002, "epoch": 2.8552278820375334, "step": 2130}, {"loss": 1.61, "grad_norm": 0.4617408514022827, "learning_rate": 0.0002, "epoch": 2.868632707774799, "step": 2140}, {"loss": 1.6445, "grad_norm": 0.44205963611602783, "learning_rate": 0.0002, "epoch": 2.8820375335120643, "step": 2150}, {"loss": 1.6231, "grad_norm": 0.47173425555229187, "learning_rate": 0.0002, "epoch": 2.89544235924933, "step": 2160}, {"loss": 1.6425, "grad_norm": 0.46379899978637695, "learning_rate": 0.0002, "epoch": 2.908847184986595, "step": 2170}, {"loss": 1.6403, "grad_norm": 0.4999759793281555, "learning_rate": 0.0002, "epoch": 2.9222520107238603, "step": 2180}, {"loss": 1.6741, "grad_norm": 0.4607947766780853, "learning_rate": 0.0002, "epoch": 2.935656836461126, "step": 2190}, {"loss": 1.6889, "grad_norm": 0.4359836280345917, "learning_rate": 0.0002, "epoch": 2.9490616621983916, "step": 2200}, {"loss": 1.6478, "grad_norm": 0.5195549726486206, "learning_rate": 0.0002, "epoch": 2.962466487935657, "step": 2210}, {"loss": 1.6348, "grad_norm": 0.4914056062698364, "learning_rate": 0.0002, "epoch": 2.975871313672922, "step": 2220}, {"loss": 1.6594, "grad_norm": 0.4647377133369446, "learning_rate": 0.0002, "epoch": 2.9892761394101877, "step": 2230}, {"eval_loss": 1.8368606567382812, "eval_runtime": 90.5623, "eval_samples_per_second": 5.687, "eval_steps_per_second": 0.718, "epoch": 3.0, "step": 2238}, {"loss": 1.5704, "grad_norm": 0.40689945220947266, "learning_rate": 0.0002, "epoch": 3.002680965147453, "step": 2240}, {"loss": 1.5961, "grad_norm": 0.4699273705482483, "learning_rate": 0.0002, "epoch": 3.0160857908847185, "step": 2250}, {"loss": 1.5182, "grad_norm": 0.5531830787658691, "learning_rate": 0.0002, "epoch": 3.0294906166219837, "step": 2260}, {"loss": 1.4924, "grad_norm": 0.5441790223121643, "learning_rate": 0.0002, "epoch": 3.0428954423592494, "step": 2270}, {"loss": 1.4953, "grad_norm": 0.6145012974739075, "learning_rate": 0.0002, "epoch": 3.0563002680965146, "step": 2280}, {"loss": 1.4861, "grad_norm": 0.6997102499008179, "learning_rate": 0.0002, "epoch": 3.06970509383378, "step": 2290}, {"loss": 1.5853, "grad_norm": 0.6082330942153931, "learning_rate": 0.0002, "epoch": 3.0831099195710454, "step": 2300}, {"loss": 1.5377, "grad_norm": 0.5294155478477478, "learning_rate": 0.0002, "epoch": 3.096514745308311, "step": 2310}, {"loss": 1.5452, "grad_norm": 0.7200340032577515, "learning_rate": 0.0002, "epoch": 3.1099195710455763, "step": 2320}, {"loss": 1.5296, "grad_norm": 0.721092939376831, "learning_rate": 0.0002, "epoch": 3.123324396782842, "step": 2330}, {"loss": 1.5307, "grad_norm": 0.5344305038452148, "learning_rate": 0.0002, "epoch": 3.136729222520107, "step": 2340}, {"loss": 1.4347, "grad_norm": 0.5533145070075989, "learning_rate": 0.0002, "epoch": 3.1501340482573728, "step": 2350}, {"loss": 1.529, "grad_norm": 0.5976856350898743, "learning_rate": 0.0002, "epoch": 3.163538873994638, "step": 2360}, {"loss": 1.6044, "grad_norm": 0.4974960386753082, "learning_rate": 0.0002, "epoch": 3.1769436997319036, "step": 2370}, {"loss": 1.5554, "grad_norm": 0.6377840042114258, "learning_rate": 0.0002, "epoch": 3.190348525469169, "step": 2380}, {"loss": 1.5322, "grad_norm": 0.5447293519973755, "learning_rate": 0.0002, "epoch": 3.2037533512064345, "step": 2390}, {"loss": 1.5127, "grad_norm": 0.49577030539512634, "learning_rate": 0.0002, "epoch": 3.2171581769436997, "step": 2400}, {"loss": 1.4768, "grad_norm": 0.5588275790214539, "learning_rate": 0.0002, "epoch": 3.2305630026809653, "step": 2410}, {"loss": 1.4755, "grad_norm": 0.6429149508476257, "learning_rate": 0.0002, "epoch": 3.2439678284182305, "step": 2420}, {"loss": 1.5596, "grad_norm": 0.5713154673576355, "learning_rate": 0.0002, "epoch": 3.257372654155496, "step": 2430}, {"loss": 1.4763, "grad_norm": 0.6348955035209656, "learning_rate": 0.0002, "epoch": 3.2707774798927614, "step": 2440}, {"loss": 1.509, "grad_norm": 0.5675528645515442, "learning_rate": 0.0002, "epoch": 3.284182305630027, "step": 2450}, {"loss": 1.5867, "grad_norm": 0.5570188164710999, "learning_rate": 0.0002, "epoch": 3.297587131367292, "step": 2460}, {"loss": 1.554, "grad_norm": 0.6029602289199829, "learning_rate": 0.0002, "epoch": 3.310991957104558, "step": 2470}, {"loss": 1.5094, "grad_norm": 0.523206353187561, "learning_rate": 0.0002, "epoch": 3.324396782841823, "step": 2480}, {"loss": 1.4854, "grad_norm": 0.5912408828735352, "learning_rate": 0.0002, "epoch": 3.3378016085790883, "step": 2490}, {"loss": 1.5097, "grad_norm": 0.5524865984916687, "learning_rate": 0.0002, "epoch": 3.351206434316354, "step": 2500}, {"loss": 1.5064, "grad_norm": 0.60386061668396, "learning_rate": 0.0002, "epoch": 3.3646112600536195, "step": 2510}, {"loss": 1.564, "grad_norm": 0.5838595628738403, "learning_rate": 0.0002, "epoch": 3.3780160857908847, "step": 2520}, {"loss": 1.4615, "grad_norm": 0.5400974154472351, "learning_rate": 0.0002, "epoch": 3.39142091152815, "step": 2530}, {"loss": 1.5349, "grad_norm": 0.6150162220001221, "learning_rate": 0.0002, "epoch": 3.4048257372654156, "step": 2540}, {"loss": 1.5978, "grad_norm": 0.5279412269592285, "learning_rate": 0.0002, "epoch": 3.418230563002681, "step": 2550}, {"loss": 1.5063, "grad_norm": 0.5974063873291016, "learning_rate": 0.0002, "epoch": 3.4316353887399464, "step": 2560}, {"loss": 1.5825, "grad_norm": 0.661573052406311, "learning_rate": 0.0002, "epoch": 3.4450402144772116, "step": 2570}, {"loss": 1.5204, "grad_norm": 0.577880322933197, "learning_rate": 0.0002, "epoch": 3.4584450402144773, "step": 2580}, {"loss": 1.5295, "grad_norm": 0.5532318949699402, "learning_rate": 0.0002, "epoch": 3.4718498659517425, "step": 2590}, {"loss": 1.4933, "grad_norm": 0.5764921307563782, "learning_rate": 0.0002, "epoch": 3.485254691689008, "step": 2600}, {"loss": 1.4355, "grad_norm": 0.6145682334899902, "learning_rate": 0.0002, "epoch": 3.4986595174262733, "step": 2610}, {"loss": 1.4968, "grad_norm": 0.6561126112937927, "learning_rate": 0.0002, "epoch": 3.512064343163539, "step": 2620}, {"loss": 1.5309, "grad_norm": 0.5673288106918335, "learning_rate": 0.0002, "epoch": 3.525469168900804, "step": 2630}, {"loss": 1.5274, "grad_norm": 0.6215338706970215, "learning_rate": 0.0002, "epoch": 3.53887399463807, "step": 2640}, {"loss": 1.5117, "grad_norm": 0.5512040853500366, "learning_rate": 0.0002, "epoch": 3.552278820375335, "step": 2650}, {"loss": 1.5188, "grad_norm": 0.49503496289253235, "learning_rate": 0.0002, "epoch": 3.5656836461126007, "step": 2660}, {"loss": 1.524, "grad_norm": 0.5714912414550781, "learning_rate": 0.0002, "epoch": 3.579088471849866, "step": 2670}, {"loss": 1.4651, "grad_norm": 0.6883154511451721, "learning_rate": 0.0002, "epoch": 3.592493297587131, "step": 2680}, {"loss": 1.5174, "grad_norm": 0.5989556908607483, "learning_rate": 0.0002, "epoch": 3.6058981233243967, "step": 2690}, {"loss": 1.5335, "grad_norm": 0.630268394947052, "learning_rate": 0.0002, "epoch": 3.6193029490616624, "step": 2700}, {"loss": 1.4681, "grad_norm": 0.5819358229637146, "learning_rate": 0.0002, "epoch": 3.6327077747989276, "step": 2710}, {"loss": 1.5676, "grad_norm": 0.6102097034454346, "learning_rate": 0.0002, "epoch": 3.646112600536193, "step": 2720}, {"loss": 1.5566, "grad_norm": 0.6858501434326172, "learning_rate": 0.0002, "epoch": 3.6595174262734584, "step": 2730}, {"loss": 1.5242, "grad_norm": 0.6328608393669128, "learning_rate": 0.0002, "epoch": 3.672922252010724, "step": 2740}, {"loss": 1.5211, "grad_norm": 0.5366981029510498, "learning_rate": 0.0002, "epoch": 3.6863270777479893, "step": 2750}, {"loss": 1.5532, "grad_norm": 0.7048938274383545, "learning_rate": 0.0002, "epoch": 3.6997319034852545, "step": 2760}, {"loss": 1.5001, "grad_norm": 0.5371938347816467, "learning_rate": 0.0002, "epoch": 3.71313672922252, "step": 2770}, {"loss": 1.557, "grad_norm": 0.6142212152481079, "learning_rate": 0.0002, "epoch": 3.726541554959786, "step": 2780}, {"loss": 1.5191, "grad_norm": 0.6164522171020508, "learning_rate": 0.0002, "epoch": 3.739946380697051, "step": 2790}, {"loss": 1.5071, "grad_norm": 0.7511836886405945, "learning_rate": 0.0002, "epoch": 3.753351206434316, "step": 2800}, {"loss": 1.5775, "grad_norm": 0.6194717288017273, "learning_rate": 0.0002, "epoch": 3.766756032171582, "step": 2810}, {"loss": 1.5721, "grad_norm": 0.676721453666687, "learning_rate": 0.0002, "epoch": 3.780160857908847, "step": 2820}, {"loss": 1.502, "grad_norm": 0.5646911263465881, "learning_rate": 0.0002, "epoch": 3.7935656836461127, "step": 2830}, {"loss": 1.4871, "grad_norm": 0.5874826908111572, "learning_rate": 0.0002, "epoch": 3.806970509383378, "step": 2840}, {"loss": 1.5046, "grad_norm": 0.6395232677459717, "learning_rate": 0.0002, "epoch": 3.8203753351206435, "step": 2850}, {"loss": 1.5088, "grad_norm": 0.624563992023468, "learning_rate": 0.0002, "epoch": 3.8337801608579087, "step": 2860}, {"loss": 1.479, "grad_norm": 0.59019935131073, "learning_rate": 0.0002, "epoch": 3.8471849865951744, "step": 2870}, {"loss": 1.4693, "grad_norm": 0.6700479984283447, "learning_rate": 0.0002, "epoch": 3.8605898123324396, "step": 2880}, {"loss": 1.5032, "grad_norm": 0.6131282448768616, "learning_rate": 0.0002, "epoch": 3.8739946380697052, "step": 2890}, {"loss": 1.5446, "grad_norm": 0.6807777881622314, "learning_rate": 0.0002, "epoch": 3.8873994638069704, "step": 2900}, {"loss": 1.5618, "grad_norm": 0.5297217965126038, "learning_rate": 0.0002, "epoch": 3.900804289544236, "step": 2910}, {"loss": 1.5046, "grad_norm": 0.5795540809631348, "learning_rate": 0.0002, "epoch": 3.9142091152815013, "step": 2920}, {"loss": 1.5155, "grad_norm": 0.5549747347831726, "learning_rate": 0.0002, "epoch": 3.927613941018767, "step": 2930}, {"loss": 1.5932, "grad_norm": 0.5895092487335205, "learning_rate": 0.0002, "epoch": 3.941018766756032, "step": 2940}, {"loss": 1.5831, "grad_norm": 0.590002715587616, "learning_rate": 0.0002, "epoch": 3.9544235924932973, "step": 2950}, {"loss": 1.592, "grad_norm": 0.7847695350646973, "learning_rate": 0.0002, "epoch": 3.967828418230563, "step": 2960}, {"loss": 1.4892, "grad_norm": 0.5845848321914673, "learning_rate": 0.0002, "epoch": 3.9812332439678286, "step": 2970}, {"loss": 1.5094, "grad_norm": 0.5861571431159973, "learning_rate": 0.0002, "epoch": 3.994638069705094, "step": 2980}, {"eval_loss": 1.8821998834609985, "eval_runtime": 90.8701, "eval_samples_per_second": 5.667, "eval_steps_per_second": 0.715, "epoch": 4.0, "step": 2984}, {"loss": 1.4156, "grad_norm": 0.6209918260574341, "learning_rate": 0.0002, "epoch": 4.008042895442359, "step": 2990}, {"loss": 1.4244, "grad_norm": 0.607226550579071, "learning_rate": 0.0002, "epoch": 4.021447721179625, "step": 3000}, {"loss": 1.3652, "grad_norm": 0.6677961349487305, "learning_rate": 0.0002, "epoch": 4.03485254691689, "step": 3010}, {"loss": 1.3815, "grad_norm": 0.9053248763084412, "learning_rate": 0.0002, "epoch": 4.048257372654155, "step": 3020}, {"loss": 1.4346, "grad_norm": 0.6815084218978882, "learning_rate": 0.0002, "epoch": 4.061662198391421, "step": 3030}, {"loss": 1.3, "grad_norm": 0.6709407567977905, "learning_rate": 0.0002, "epoch": 4.075067024128686, "step": 3040}, {"loss": 1.3406, "grad_norm": 0.728184163570404, "learning_rate": 0.0002, "epoch": 4.088471849865952, "step": 3050}, {"loss": 1.3404, "grad_norm": 0.817628800868988, "learning_rate": 0.0002, "epoch": 4.101876675603217, "step": 3060}, {"loss": 1.3496, "grad_norm": 0.7384206056594849, "learning_rate": 0.0002, "epoch": 4.115281501340482, "step": 3070}, {"loss": 1.3621, "grad_norm": 0.7380280494689941, "learning_rate": 0.0002, "epoch": 4.128686327077748, "step": 3080}, {"loss": 1.3425, "grad_norm": 0.8197277188301086, "learning_rate": 0.0002, "epoch": 4.142091152815014, "step": 3090}, {"loss": 1.3761, "grad_norm": 0.8971617817878723, "learning_rate": 0.0002, "epoch": 4.1554959785522785, "step": 3100}, {"loss": 1.3564, "grad_norm": 0.7409387826919556, "learning_rate": 0.0002, "epoch": 4.168900804289544, "step": 3110}, {"loss": 1.3675, "grad_norm": 0.6948909163475037, "learning_rate": 0.0002, "epoch": 4.18230563002681, "step": 3120}, {"loss": 1.3397, "grad_norm": 0.7619595527648926, "learning_rate": 0.0002, "epoch": 4.195710455764075, "step": 3130}, {"loss": 1.3864, "grad_norm": 0.7657106518745422, "learning_rate": 0.0002, "epoch": 4.20911528150134, "step": 3140}, {"loss": 1.4017, "grad_norm": 0.6919401288032532, "learning_rate": 0.0002, "epoch": 4.222520107238606, "step": 3150}, {"loss": 1.3692, "grad_norm": 0.6991415023803711, "learning_rate": 0.0002, "epoch": 4.2359249329758715, "step": 3160}, {"loss": 1.3651, "grad_norm": 0.7349252700805664, "learning_rate": 0.0002, "epoch": 4.249329758713137, "step": 3170}, {"loss": 1.367, "grad_norm": 0.8838240504264832, "learning_rate": 0.0002, "epoch": 4.262734584450402, "step": 3180}, {"loss": 1.4254, "grad_norm": 0.7240107655525208, "learning_rate": 0.0002, "epoch": 4.2761394101876675, "step": 3190}, {"loss": 1.3671, "grad_norm": 0.7338636517524719, "learning_rate": 0.0002, "epoch": 4.289544235924933, "step": 3200}, {"loss": 1.448, "grad_norm": 0.7891436815261841, "learning_rate": 0.0002, "epoch": 4.302949061662199, "step": 3210}, {"loss": 1.3291, "grad_norm": 0.7407845854759216, "learning_rate": 0.0002, "epoch": 4.316353887399464, "step": 3220}, {"loss": 1.3899, "grad_norm": 0.7635948061943054, "learning_rate": 0.0002, "epoch": 4.329758713136729, "step": 3230}, {"loss": 1.3384, "grad_norm": 0.7478461861610413, "learning_rate": 0.0002, "epoch": 4.343163538873995, "step": 3240}, {"loss": 1.388, "grad_norm": 0.7684298157691956, "learning_rate": 0.0002, "epoch": 4.35656836461126, "step": 3250}, {"loss": 1.4233, "grad_norm": 1.0287525653839111, "learning_rate": 0.0002, "epoch": 4.369973190348525, "step": 3260}, {"loss": 1.3542, "grad_norm": 0.750616192817688, "learning_rate": 0.0002, "epoch": 4.383378016085791, "step": 3270}, {"loss": 1.3158, "grad_norm": 0.7911648750305176, "learning_rate": 0.0002, "epoch": 4.396782841823057, "step": 3280}, {"loss": 1.3896, "grad_norm": 0.9156750440597534, "learning_rate": 0.0002, "epoch": 4.410187667560321, "step": 3290}, {"loss": 1.3887, "grad_norm": 1.0180249214172363, "learning_rate": 0.0002, "epoch": 4.423592493297587, "step": 3300}, {"loss": 1.4143, "grad_norm": 1.0792218446731567, "learning_rate": 0.0002, "epoch": 4.436997319034853, "step": 3310}, {"loss": 1.3314, "grad_norm": 0.8027488589286804, "learning_rate": 0.0002, "epoch": 4.450402144772118, "step": 3320}, {"loss": 1.4144, "grad_norm": 0.8037815093994141, "learning_rate": 0.0002, "epoch": 4.463806970509383, "step": 3330}, {"loss": 1.4124, "grad_norm": 0.7907946705818176, "learning_rate": 0.0002, "epoch": 4.477211796246649, "step": 3340}, {"loss": 1.443, "grad_norm": 0.7206302881240845, "learning_rate": 0.0002, "epoch": 4.490616621983914, "step": 3350}, {"loss": 1.3822, "grad_norm": 0.7697674632072449, "learning_rate": 0.0002, "epoch": 4.50402144772118, "step": 3360}, {"loss": 1.3923, "grad_norm": 0.7315130829811096, "learning_rate": 0.0002, "epoch": 4.517426273458445, "step": 3370}, {"loss": 1.3598, "grad_norm": 0.7896273136138916, "learning_rate": 0.0002, "epoch": 4.53083109919571, "step": 3380}, {"loss": 1.3947, "grad_norm": 0.7720345258712769, "learning_rate": 0.0002, "epoch": 4.544235924932976, "step": 3390}, {"loss": 1.404, "grad_norm": 0.8304631114006042, "learning_rate": 0.0002, "epoch": 4.557640750670242, "step": 3400}, {"loss": 1.3712, "grad_norm": 0.7408214211463928, "learning_rate": 0.0002, "epoch": 4.571045576407506, "step": 3410}, {"loss": 1.3957, "grad_norm": 0.8100157976150513, "learning_rate": 0.0002, "epoch": 4.584450402144772, "step": 3420}, {"loss": 1.47, "grad_norm": 0.7829574942588806, "learning_rate": 0.0002, "epoch": 4.597855227882038, "step": 3430}, {"loss": 1.3684, "grad_norm": 0.9529728889465332, "learning_rate": 0.0002, "epoch": 4.6112600536193025, "step": 3440}, {"loss": 1.3984, "grad_norm": 1.0769460201263428, "learning_rate": 0.0002, "epoch": 4.624664879356568, "step": 3450}, {"loss": 1.4063, "grad_norm": 0.8941947817802429, "learning_rate": 0.0002, "epoch": 4.638069705093834, "step": 3460}, {"loss": 1.4421, "grad_norm": 0.7860096096992493, "learning_rate": 0.0002, "epoch": 4.651474530831099, "step": 3470}, {"loss": 1.3782, "grad_norm": 0.8184044361114502, "learning_rate": 0.0002, "epoch": 4.664879356568365, "step": 3480}, {"loss": 1.3885, "grad_norm": 0.7852717638015747, "learning_rate": 0.0002, "epoch": 4.67828418230563, "step": 3490}, {"loss": 1.4139, "grad_norm": 0.750586986541748, "learning_rate": 0.0002, "epoch": 4.6916890080428955, "step": 3500}, {"loss": 1.3224, "grad_norm": 0.7966068983078003, "learning_rate": 0.0002, "epoch": 4.705093833780161, "step": 3510}, {"loss": 1.4052, "grad_norm": 0.8387030959129333, "learning_rate": 0.0002, "epoch": 4.718498659517426, "step": 3520}, {"loss": 1.4541, "grad_norm": 0.7373180389404297, "learning_rate": 0.0002, "epoch": 4.7319034852546915, "step": 3530}, {"loss": 1.4148, "grad_norm": 0.8415353894233704, "learning_rate": 0.0002, "epoch": 4.745308310991957, "step": 3540}, {"loss": 1.4236, "grad_norm": 0.7155488133430481, "learning_rate": 0.0002, "epoch": 4.758713136729223, "step": 3550}, {"loss": 1.3454, "grad_norm": 0.697658896446228, "learning_rate": 0.0002, "epoch": 4.772117962466488, "step": 3560}, {"loss": 1.4002, "grad_norm": 0.8722999095916748, "learning_rate": 0.0002, "epoch": 4.785522788203753, "step": 3570}, {"loss": 1.4224, "grad_norm": 0.8106381297111511, "learning_rate": 0.0002, "epoch": 4.798927613941019, "step": 3580}, {"loss": 1.3525, "grad_norm": 0.9320500493049622, "learning_rate": 0.0002, "epoch": 4.8123324396782845, "step": 3590}, {"loss": 1.3675, "grad_norm": 0.7583016157150269, "learning_rate": 0.0002, "epoch": 4.825737265415549, "step": 3600}, {"loss": 1.3761, "grad_norm": 0.790050208568573, "learning_rate": 0.0002, "epoch": 4.839142091152815, "step": 3610}, {"loss": 1.4144, "grad_norm": 0.7481580972671509, "learning_rate": 0.0002, "epoch": 4.8525469168900806, "step": 3620}, {"loss": 1.4424, "grad_norm": 0.8709374666213989, "learning_rate": 0.0002, "epoch": 4.865951742627346, "step": 3630}, {"loss": 1.3758, "grad_norm": 0.7266733050346375, "learning_rate": 0.0002, "epoch": 4.879356568364611, "step": 3640}, {"loss": 1.4254, "grad_norm": 0.7669504880905151, "learning_rate": 0.0002, "epoch": 4.892761394101877, "step": 3650}, {"loss": 1.3956, "grad_norm": 0.7855764627456665, "learning_rate": 0.0002, "epoch": 4.906166219839142, "step": 3660}, {"loss": 1.4609, "grad_norm": 0.8145440816879272, "learning_rate": 0.0002, "epoch": 4.919571045576408, "step": 3670}, {"loss": 1.4152, "grad_norm": 0.7487278580665588, "learning_rate": 0.0002, "epoch": 4.932975871313673, "step": 3680}, {"loss": 1.4386, "grad_norm": 0.8390981554985046, "learning_rate": 0.0002, "epoch": 4.946380697050938, "step": 3690}, {"loss": 1.3504, "grad_norm": 0.663752555847168, "learning_rate": 0.0002, "epoch": 4.959785522788204, "step": 3700}, {"loss": 1.3453, "grad_norm": 0.7821969985961914, "learning_rate": 0.0002, "epoch": 4.973190348525469, "step": 3710}, {"loss": 1.3936, "grad_norm": 0.9157266020774841, "learning_rate": 0.0002, "epoch": 4.986595174262734, "step": 3720}, {"loss": 1.3925, "grad_norm": 0.7683535814285278, "learning_rate": 0.0002, "epoch": 5.0, "step": 3730}, {"eval_loss": 1.9639414548873901, "eval_runtime": 92.0173, "eval_samples_per_second": 5.597, "eval_steps_per_second": 0.706, "epoch": 5.0, "step": 3730}, {"loss": 1.1852, "grad_norm": 1.3000373840332031, "learning_rate": 0.0002, "epoch": 5.013404825737266, "step": 3740}, {"loss": 1.1922, "grad_norm": 0.8916982412338257, "learning_rate": 0.0002, "epoch": 5.02680965147453, "step": 3750}, {"loss": 1.2113, "grad_norm": 1.0365116596221924, "learning_rate": 0.0002, "epoch": 5.040214477211796, "step": 3760}, {"loss": 1.2941, "grad_norm": 0.999420166015625, "learning_rate": 0.0002, "epoch": 5.053619302949062, "step": 3770}, {"loss": 1.24, "grad_norm": 1.093572974205017, "learning_rate": 0.0002, "epoch": 5.067024128686327, "step": 3780}, {"loss": 1.2345, "grad_norm": 1.1137515306472778, "learning_rate": 0.0002, "epoch": 5.080428954423592, "step": 3790}, {"loss": 1.1646, "grad_norm": 1.0328283309936523, "learning_rate": 0.0002, "epoch": 5.093833780160858, "step": 3800}, {"loss": 1.1716, "grad_norm": 1.0444108247756958, "learning_rate": 0.0002, "epoch": 5.107238605898123, "step": 3810}, {"loss": 1.2226, "grad_norm": 0.858148992061615, "learning_rate": 0.0002, "epoch": 5.120643431635389, "step": 3820}, {"loss": 1.1691, "grad_norm": 0.94026780128479, "learning_rate": 0.0002, "epoch": 5.134048257372654, "step": 3830}, {"loss": 1.1902, "grad_norm": 0.8987152576446533, "learning_rate": 0.0002, "epoch": 5.1474530831099194, "step": 3840}, {"loss": 1.1562, "grad_norm": 0.922997236251831, "learning_rate": 0.0002, "epoch": 5.160857908847185, "step": 3850}, {"loss": 1.2072, "grad_norm": 0.9172422289848328, "learning_rate": 0.0002, "epoch": 5.174262734584451, "step": 3860}, {"loss": 1.1802, "grad_norm": 1.02277672290802, "learning_rate": 0.0002, "epoch": 5.1876675603217155, "step": 3870}, {"loss": 1.2206, "grad_norm": 1.093826413154602, "learning_rate": 0.0002, "epoch": 5.201072386058981, "step": 3880}, {"loss": 1.2578, "grad_norm": 0.9362447261810303, "learning_rate": 0.0002, "epoch": 5.214477211796247, "step": 3890}, {"loss": 1.2335, "grad_norm": 1.0564044713974, "learning_rate": 0.0002, "epoch": 5.227882037533512, "step": 3900}, {"loss": 1.1936, "grad_norm": 0.869575023651123, "learning_rate": 0.0002, "epoch": 5.241286863270777, "step": 3910}, {"loss": 1.2301, "grad_norm": 1.0383203029632568, "learning_rate": 0.0002, "epoch": 5.254691689008043, "step": 3920}, {"loss": 1.2076, "grad_norm": 0.9146919846534729, "learning_rate": 0.0002, "epoch": 5.2680965147453085, "step": 3930}, {"loss": 1.2804, "grad_norm": 0.9226430654525757, "learning_rate": 0.0002, "epoch": 5.281501340482574, "step": 3940}, {"loss": 1.2506, "grad_norm": 0.8703194260597229, "learning_rate": 0.0002, "epoch": 5.294906166219839, "step": 3950}, {"loss": 1.2533, "grad_norm": 1.0588284730911255, "learning_rate": 0.0002, "epoch": 5.3083109919571045, "step": 3960}, {"loss": 1.2405, "grad_norm": 1.1131688356399536, "learning_rate": 0.0002, "epoch": 5.32171581769437, "step": 3970}, {"loss": 1.1719, "grad_norm": 1.1073139905929565, "learning_rate": 0.0002, "epoch": 5.335120643431635, "step": 3980}, {"loss": 1.2375, "grad_norm": 0.9269049763679504, "learning_rate": 0.0002, "epoch": 5.348525469168901, "step": 3990}, {"loss": 1.2513, "grad_norm": 0.9802212715148926, "learning_rate": 0.0002, "epoch": 5.361930294906166, "step": 4000}, {"loss": 1.1573, "grad_norm": 0.9152148365974426, "learning_rate": 0.0002, "epoch": 5.375335120643432, "step": 4010}, {"loss": 1.2673, "grad_norm": 1.0395890474319458, "learning_rate": 0.0002, "epoch": 5.388739946380697, "step": 4020}, {"loss": 1.2228, "grad_norm": 1.0989106893539429, "learning_rate": 0.0002, "epoch": 5.402144772117962, "step": 4030}, {"loss": 1.2717, "grad_norm": 1.0305225849151611, "learning_rate": 0.0002, "epoch": 5.415549597855228, "step": 4040}, {"loss": 1.2751, "grad_norm": 0.8416915535926819, "learning_rate": 0.0002, "epoch": 5.428954423592494, "step": 4050}, {"loss": 1.2205, "grad_norm": 0.9120758175849915, "learning_rate": 0.0002, "epoch": 5.442359249329758, "step": 4060}, {"loss": 1.2812, "grad_norm": 1.197936773300171, "learning_rate": 0.0002, "epoch": 5.455764075067024, "step": 4070}, {"loss": 1.2346, "grad_norm": 1.0116125345230103, "learning_rate": 0.0002, "epoch": 5.46916890080429, "step": 4080}, {"loss": 1.1746, "grad_norm": 1.048995018005371, "learning_rate": 0.0002, "epoch": 5.482573726541555, "step": 4090}, {"loss": 1.1858, "grad_norm": 0.929185152053833, "learning_rate": 0.0002, "epoch": 5.49597855227882, "step": 4100}, {"loss": 1.3068, "grad_norm": 0.9064884781837463, "learning_rate": 0.0002, "epoch": 5.509383378016086, "step": 4110}, {"loss": 1.2481, "grad_norm": 1.2009892463684082, "learning_rate": 0.0002, "epoch": 5.522788203753351, "step": 4120}, {"loss": 1.2788, "grad_norm": 0.9054455161094666, "learning_rate": 0.0002, "epoch": 5.536193029490617, "step": 4130}, {"loss": 1.1624, "grad_norm": 0.9978497624397278, "learning_rate": 0.0002, "epoch": 5.549597855227882, "step": 4140}, {"loss": 1.2814, "grad_norm": 0.9779615998268127, "learning_rate": 0.0002, "epoch": 5.563002680965147, "step": 4150}, {"loss": 1.2361, "grad_norm": 1.0515185594558716, "learning_rate": 0.0002, "epoch": 5.576407506702413, "step": 4160}, {"loss": 1.2278, "grad_norm": 0.8618236184120178, "learning_rate": 0.0002, "epoch": 5.589812332439678, "step": 4170}, {"loss": 1.2853, "grad_norm": 0.9569384455680847, "learning_rate": 0.0002, "epoch": 5.603217158176943, "step": 4180}, {"loss": 1.2824, "grad_norm": 0.968923807144165, "learning_rate": 0.0002, "epoch": 5.616621983914209, "step": 4190}, {"loss": 1.3055, "grad_norm": 0.8759993314743042, "learning_rate": 0.0002, "epoch": 5.630026809651475, "step": 4200}, {"loss": 1.2912, "grad_norm": 0.9284833669662476, "learning_rate": 0.0002, "epoch": 5.64343163538874, "step": 4210}, {"loss": 1.2886, "grad_norm": 0.9293071031570435, "learning_rate": 0.0002, "epoch": 5.656836461126005, "step": 4220}, {"loss": 1.2704, "grad_norm": 0.9872161149978638, "learning_rate": 0.0002, "epoch": 5.670241286863271, "step": 4230}, {"loss": 1.2525, "grad_norm": 0.9545941948890686, "learning_rate": 0.0002, "epoch": 5.683646112600536, "step": 4240}, {"loss": 1.2639, "grad_norm": 1.0202341079711914, "learning_rate": 0.0002, "epoch": 5.697050938337801, "step": 4250}, {"loss": 1.2259, "grad_norm": 0.9821504950523376, "learning_rate": 0.0002, "epoch": 5.710455764075067, "step": 4260}, {"loss": 1.2243, "grad_norm": 1.0581456422805786, "learning_rate": 0.0002, "epoch": 5.7238605898123325, "step": 4270}, {"loss": 1.227, "grad_norm": 0.9639395475387573, "learning_rate": 0.0002, "epoch": 5.737265415549598, "step": 4280}, {"loss": 1.2849, "grad_norm": 2.205458164215088, "learning_rate": 0.0002, "epoch": 5.750670241286863, "step": 4290}, {"loss": 1.2785, "grad_norm": 1.0294393301010132, "learning_rate": 0.0002, "epoch": 5.7640750670241285, "step": 4300}, {"loss": 1.261, "grad_norm": 1.0360256433486938, "learning_rate": 0.0002, "epoch": 5.777479892761394, "step": 4310}, {"loss": 1.2891, "grad_norm": 0.9390154480934143, "learning_rate": 0.0002, "epoch": 5.79088471849866, "step": 4320}, {"loss": 1.248, "grad_norm": 0.9048963189125061, "learning_rate": 0.0002, "epoch": 5.804289544235925, "step": 4330}, {"loss": 1.2753, "grad_norm": 0.9310713410377502, "learning_rate": 0.0002, "epoch": 5.81769436997319, "step": 4340}, {"loss": 1.2393, "grad_norm": 1.038282871246338, "learning_rate": 0.0002, "epoch": 5.831099195710456, "step": 4350}, {"loss": 1.3398, "grad_norm": 0.9194827079772949, "learning_rate": 0.0002, "epoch": 5.8445040214477215, "step": 4360}, {"loss": 1.3049, "grad_norm": 0.9568411111831665, "learning_rate": 0.0002, "epoch": 5.857908847184986, "step": 4370}, {"loss": 1.2899, "grad_norm": 0.9088910818099976, "learning_rate": 0.0002, "epoch": 5.871313672922252, "step": 4380}, {"loss": 1.2497, "grad_norm": 1.0605647563934326, "learning_rate": 0.0002, "epoch": 5.884718498659518, "step": 4390}, {"loss": 1.2387, "grad_norm": 0.8016388416290283, "learning_rate": 0.0002, "epoch": 5.898123324396783, "step": 4400}, {"loss": 1.3046, "grad_norm": 1.0792853832244873, "learning_rate": 0.0002, "epoch": 5.911528150134048, "step": 4410}, {"loss": 1.282, "grad_norm": 1.059403657913208, "learning_rate": 0.0002, "epoch": 5.924932975871314, "step": 4420}, {"loss": 1.2524, "grad_norm": 0.87492436170578, "learning_rate": 0.0002, "epoch": 5.938337801608579, "step": 4430}, {"loss": 1.2373, "grad_norm": 1.0911097526550293, "learning_rate": 0.0002, "epoch": 5.951742627345844, "step": 4440}, {"loss": 1.3073, "grad_norm": 0.8860997557640076, "learning_rate": 0.0002, "epoch": 5.96514745308311, "step": 4450}, {"loss": 1.3273, "grad_norm": 0.9176826477050781, "learning_rate": 0.0002, "epoch": 5.978552278820375, "step": 4460}, {"loss": 1.2725, "grad_norm": 0.9018680453300476, "learning_rate": 0.0002, "epoch": 5.991957104557641, "step": 4470}, {"eval_loss": 2.0600433349609375, "eval_runtime": 92.2728, "eval_samples_per_second": 5.581, "eval_steps_per_second": 0.704, "epoch": 6.0, "step": 4476}, {"loss": 1.2019, "grad_norm": 0.8612148761749268, "learning_rate": 0.0002, "epoch": 6.005361930294906, "step": 4480}, {"loss": 1.1005, "grad_norm": 1.170229434967041, "learning_rate": 0.0002, "epoch": 6.018766756032171, "step": 4490}, {"loss": 1.0129, "grad_norm": 1.1005233526229858, "learning_rate": 0.0002, "epoch": 6.032171581769437, "step": 4500}, {"loss": 1.0936, "grad_norm": 1.1763442754745483, "learning_rate": 0.0002, "epoch": 6.045576407506703, "step": 4510}, {"loss": 0.9865, "grad_norm": 1.0595353841781616, "learning_rate": 0.0002, "epoch": 6.058981233243967, "step": 4520}, {"loss": 0.9543, "grad_norm": 1.3554084300994873, "learning_rate": 0.0002, "epoch": 6.072386058981233, "step": 4530}, {"loss": 1.0619, "grad_norm": 1.238821268081665, "learning_rate": 0.0002, "epoch": 6.085790884718499, "step": 4540}, {"loss": 1.0951, "grad_norm": 1.0496071577072144, "learning_rate": 0.0002, "epoch": 6.099195710455764, "step": 4550}, {"loss": 1.1128, "grad_norm": 1.3410215377807617, "learning_rate": 0.0002, "epoch": 6.112600536193029, "step": 4560}, {"loss": 1.0824, "grad_norm": 1.2559033632278442, "learning_rate": 0.0002, "epoch": 6.126005361930295, "step": 4570}, {"loss": 1.0645, "grad_norm": 1.2556545734405518, "learning_rate": 0.0002, "epoch": 6.13941018766756, "step": 4580}, {"loss": 1.1219, "grad_norm": 1.050678014755249, "learning_rate": 0.0002, "epoch": 6.152815013404826, "step": 4590}, {"loss": 1.0421, "grad_norm": 1.566770076751709, "learning_rate": 0.0002, "epoch": 6.166219839142091, "step": 4600}, {"loss": 1.0617, "grad_norm": 1.1482226848602295, "learning_rate": 0.0002, "epoch": 6.1796246648793565, "step": 4610}, {"loss": 1.0477, "grad_norm": 1.2731150388717651, "learning_rate": 0.0002, "epoch": 6.193029490616622, "step": 4620}, {"loss": 1.0291, "grad_norm": 1.4135994911193848, "learning_rate": 0.0002, "epoch": 6.206434316353888, "step": 4630}, {"loss": 1.0666, "grad_norm": 1.2925093173980713, "learning_rate": 0.0002, "epoch": 6.2198391420911525, "step": 4640}, {"loss": 1.0657, "grad_norm": 1.1199861764907837, "learning_rate": 0.0002, "epoch": 6.233243967828418, "step": 4650}, {"loss": 1.1143, "grad_norm": 1.2010078430175781, "learning_rate": 0.0002, "epoch": 6.246648793565684, "step": 4660}, {"loss": 1.1186, "grad_norm": 1.2655692100524902, "learning_rate": 0.0002, "epoch": 6.2600536193029495, "step": 4670}, {"loss": 1.0276, "grad_norm": 1.0960880517959595, "learning_rate": 0.0002, "epoch": 6.273458445040214, "step": 4680}, {"loss": 1.0576, "grad_norm": 1.170759916305542, "learning_rate": 0.0002, "epoch": 6.28686327077748, "step": 4690}, {"loss": 1.0852, "grad_norm": 1.1199755668640137, "learning_rate": 0.0002, "epoch": 6.3002680965147455, "step": 4700}, {"loss": 1.0171, "grad_norm": 1.1477710008621216, "learning_rate": 0.0002, "epoch": 6.31367292225201, "step": 4710}, {"loss": 1.0411, "grad_norm": 1.0862090587615967, "learning_rate": 0.0002, "epoch": 6.327077747989276, "step": 4720}, {"loss": 1.0299, "grad_norm": 1.1428112983703613, "learning_rate": 0.0002, "epoch": 6.340482573726542, "step": 4730}, {"loss": 1.0988, "grad_norm": 1.155534029006958, "learning_rate": 0.0002, "epoch": 6.353887399463807, "step": 4740}, {"loss": 1.1134, "grad_norm": 1.2997788190841675, "learning_rate": 0.0002, "epoch": 6.367292225201073, "step": 4750}, {"loss": 1.1386, "grad_norm": 1.1087043285369873, "learning_rate": 0.0002, "epoch": 6.380697050938338, "step": 4760}, {"loss": 1.0266, "grad_norm": 1.3957210779190063, "learning_rate": 0.0002, "epoch": 6.394101876675603, "step": 4770}, {"loss": 1.0803, "grad_norm": 1.1346395015716553, "learning_rate": 0.0002, "epoch": 6.407506702412869, "step": 4780}, {"loss": 1.0686, "grad_norm": 1.3830486536026, "learning_rate": 0.0002, "epoch": 6.420911528150134, "step": 4790}, {"loss": 1.138, "grad_norm": 1.1137559413909912, "learning_rate": 0.0002, "epoch": 6.434316353887399, "step": 4800}, {"loss": 1.0863, "grad_norm": 1.151821494102478, "learning_rate": 0.0002, "epoch": 6.447721179624665, "step": 4810}, {"loss": 1.0821, "grad_norm": 1.122589111328125, "learning_rate": 0.0002, "epoch": 6.461126005361931, "step": 4820}, {"loss": 1.1308, "grad_norm": 1.2847239971160889, "learning_rate": 0.0002, "epoch": 6.474530831099195, "step": 4830}, {"loss": 1.1001, "grad_norm": 1.027617335319519, "learning_rate": 0.0002, "epoch": 6.487935656836461, "step": 4840}, {"loss": 1.102, "grad_norm": 1.3375194072723389, "learning_rate": 0.0002, "epoch": 6.501340482573727, "step": 4850}, {"loss": 1.1055, "grad_norm": 1.1723220348358154, "learning_rate": 0.0002, "epoch": 6.514745308310992, "step": 4860}, {"loss": 1.129, "grad_norm": 1.7034224271774292, "learning_rate": 0.0002, "epoch": 6.528150134048257, "step": 4870}, {"loss": 1.0544, "grad_norm": 1.0840927362442017, "learning_rate": 0.0002, "epoch": 6.541554959785523, "step": 4880}, {"loss": 1.1194, "grad_norm": 1.3088481426239014, "learning_rate": 0.0002, "epoch": 6.554959785522788, "step": 4890}, {"loss": 1.1513, "grad_norm": 1.1394107341766357, "learning_rate": 0.0002, "epoch": 6.568364611260054, "step": 4900}, {"loss": 1.0796, "grad_norm": 1.0243184566497803, "learning_rate": 0.0002, "epoch": 6.581769436997319, "step": 4910}, {"loss": 1.2096, "grad_norm": 1.0814571380615234, "learning_rate": 0.0002, "epoch": 6.595174262734584, "step": 4920}, {"loss": 1.1279, "grad_norm": 1.1652323007583618, "learning_rate": 0.0002, "epoch": 6.60857908847185, "step": 4930}, {"loss": 1.186, "grad_norm": 1.0203579664230347, "learning_rate": 0.0002, "epoch": 6.621983914209116, "step": 4940}, {"loss": 1.1243, "grad_norm": 1.3823212385177612, "learning_rate": 0.0002, "epoch": 6.6353887399463805, "step": 4950}, {"loss": 1.1464, "grad_norm": 1.248955488204956, "learning_rate": 0.0002, "epoch": 6.648793565683646, "step": 4960}, {"loss": 1.1278, "grad_norm": 1.2215739488601685, "learning_rate": 0.0002, "epoch": 6.662198391420912, "step": 4970}, {"loss": 1.1109, "grad_norm": 1.307869553565979, "learning_rate": 0.0002, "epoch": 6.6756032171581765, "step": 4980}, {"loss": 1.1738, "grad_norm": 1.4434916973114014, "learning_rate": 0.0002, "epoch": 6.689008042895442, "step": 4990}, {"loss": 1.1068, "grad_norm": 1.1840227842330933, "learning_rate": 0.0002, "epoch": 6.702412868632708, "step": 5000}, {"loss": 1.1738, "grad_norm": 1.1775435209274292, "learning_rate": 0.0002, "epoch": 6.7158176943699734, "step": 5010}, {"loss": 1.114, "grad_norm": 1.1639968156814575, "learning_rate": 0.0002, "epoch": 6.729222520107239, "step": 5020}, {"loss": 1.1363, "grad_norm": 1.3774648904800415, "learning_rate": 0.0002, "epoch": 6.742627345844504, "step": 5030}, {"loss": 1.095, "grad_norm": 1.0328693389892578, "learning_rate": 0.0002, "epoch": 6.7560321715817695, "step": 5040}, {"loss": 1.1371, "grad_norm": 1.0495599508285522, "learning_rate": 0.0002, "epoch": 6.769436997319035, "step": 5050}, {"loss": 1.1728, "grad_norm": 1.3220133781433105, "learning_rate": 0.0002, "epoch": 6.7828418230563, "step": 5060}, {"loss": 1.13, "grad_norm": 1.3658279180526733, "learning_rate": 0.0002, "epoch": 6.7962466487935655, "step": 5070}, {"loss": 1.0755, "grad_norm": 1.3788504600524902, "learning_rate": 0.0002, "epoch": 6.809651474530831, "step": 5080}, {"loss": 1.1331, "grad_norm": 1.2342770099639893, "learning_rate": 0.0002, "epoch": 6.823056300268097, "step": 5090}, {"loss": 1.1761, "grad_norm": 1.3752578496932983, "learning_rate": 0.0002, "epoch": 6.836461126005362, "step": 5100}, {"loss": 1.078, "grad_norm": 1.0902243852615356, "learning_rate": 0.0002, "epoch": 6.849865951742627, "step": 5110}, {"loss": 1.1613, "grad_norm": 1.2125890254974365, "learning_rate": 0.0002, "epoch": 6.863270777479893, "step": 5120}, {"loss": 1.1651, "grad_norm": 1.2979270219802856, "learning_rate": 0.0002, "epoch": 6.8766756032171585, "step": 5130}, {"loss": 1.1207, "grad_norm": 1.2894749641418457, "learning_rate": 0.0002, "epoch": 6.890080428954423, "step": 5140}, {"loss": 1.1143, "grad_norm": 1.4804800748825073, "learning_rate": 0.0002, "epoch": 6.903485254691689, "step": 5150}, {"loss": 1.1245, "grad_norm": 1.1119170188903809, "learning_rate": 0.0002, "epoch": 6.916890080428955, "step": 5160}, {"loss": 1.1135, "grad_norm": 1.4991406202316284, "learning_rate": 0.0002, "epoch": 6.930294906166219, "step": 5170}, {"loss": 1.1025, "grad_norm": 1.2187672853469849, "learning_rate": 0.0002, "epoch": 6.943699731903485, "step": 5180}, {"loss": 1.1991, "grad_norm": 1.2419520616531372, "learning_rate": 0.0002, "epoch": 6.957104557640751, "step": 5190}, {"loss": 1.1231, "grad_norm": 1.359859585762024, "learning_rate": 0.0002, "epoch": 6.970509383378016, "step": 5200}, {"loss": 1.0882, "grad_norm": 1.3679486513137817, "learning_rate": 0.0002, "epoch": 6.983914209115282, "step": 5210}, {"loss": 1.1856, "grad_norm": 1.2109483480453491, "learning_rate": 0.0002, "epoch": 6.997319034852547, "step": 5220}, {"eval_loss": 2.194319725036621, "eval_runtime": 93.0187, "eval_samples_per_second": 5.537, "eval_steps_per_second": 0.699, "epoch": 7.0, "step": 5222}, {"loss": 0.9569, "grad_norm": 1.1413990259170532, "learning_rate": 0.0002, "epoch": 7.010723860589812, "step": 5230}, {"loss": 0.8378, "grad_norm": 1.228061556816101, "learning_rate": 0.0002, "epoch": 7.024128686327078, "step": 5240}, {"loss": 0.945, "grad_norm": 1.4723389148712158, "learning_rate": 0.0002, "epoch": 7.037533512064343, "step": 5250}, {"loss": 0.9419, "grad_norm": 1.6016414165496826, "learning_rate": 0.0002, "epoch": 7.050938337801608, "step": 5260}, {"loss": 0.8133, "grad_norm": 1.173973798751831, "learning_rate": 0.0002, "epoch": 7.064343163538874, "step": 5270}, {"loss": 0.9426, "grad_norm": 1.7001465559005737, "learning_rate": 0.0002, "epoch": 7.07774798927614, "step": 5280}, {"loss": 0.9189, "grad_norm": 1.5025922060012817, "learning_rate": 0.0002, "epoch": 7.091152815013404, "step": 5290}, {"loss": 0.9106, "grad_norm": 1.3865472078323364, "learning_rate": 0.0002, "epoch": 7.10455764075067, "step": 5300}, {"loss": 0.9039, "grad_norm": 1.4111610651016235, "learning_rate": 0.0002, "epoch": 7.117962466487936, "step": 5310}, {"loss": 0.8982, "grad_norm": 1.3427162170410156, "learning_rate": 0.0002, "epoch": 7.131367292225201, "step": 5320}, {"loss": 0.9665, "grad_norm": 1.592889428138733, "learning_rate": 0.0002, "epoch": 7.144772117962466, "step": 5330}, {"loss": 0.9277, "grad_norm": 1.2716485261917114, "learning_rate": 0.0002, "epoch": 7.158176943699732, "step": 5340}, {"loss": 0.932, "grad_norm": 1.3858015537261963, "learning_rate": 0.0002, "epoch": 7.171581769436997, "step": 5350}, {"loss": 0.9313, "grad_norm": 1.4250117540359497, "learning_rate": 0.0002, "epoch": 7.184986595174263, "step": 5360}, {"loss": 0.908, "grad_norm": 1.5094358921051025, "learning_rate": 0.0002, "epoch": 7.198391420911528, "step": 5370}, {"loss": 0.9656, "grad_norm": 1.299795150756836, "learning_rate": 0.0002, "epoch": 7.2117962466487935, "step": 5380}, {"loss": 0.9416, "grad_norm": 1.4491885900497437, "learning_rate": 0.0002, "epoch": 7.225201072386059, "step": 5390}, {"loss": 0.9136, "grad_norm": 1.4907571077346802, "learning_rate": 0.0002, "epoch": 7.238605898123325, "step": 5400}, {"loss": 0.8973, "grad_norm": 1.3086504936218262, "learning_rate": 0.0002, "epoch": 7.2520107238605895, "step": 5410}, {"loss": 0.984, "grad_norm": 1.2242939472198486, "learning_rate": 0.0002, "epoch": 7.265415549597855, "step": 5420}, {"loss": 0.9271, "grad_norm": 1.4723531007766724, "learning_rate": 0.0002, "epoch": 7.278820375335121, "step": 5430}, {"loss": 0.9531, "grad_norm": 1.3514219522476196, "learning_rate": 0.0002, "epoch": 7.292225201072386, "step": 5440}, {"loss": 1.0067, "grad_norm": 1.484549641609192, "learning_rate": 0.0002, "epoch": 7.305630026809651, "step": 5450}, {"loss": 0.9408, "grad_norm": 1.4641015529632568, "learning_rate": 0.0002, "epoch": 7.319034852546917, "step": 5460}, {"loss": 0.946, "grad_norm": 1.4476960897445679, "learning_rate": 0.0002, "epoch": 7.3324396782841825, "step": 5470}, {"loss": 0.927, "grad_norm": 1.5155150890350342, "learning_rate": 0.0002, "epoch": 7.345844504021448, "step": 5480}, {"loss": 0.9945, "grad_norm": 1.4297741651535034, "learning_rate": 0.0002, "epoch": 7.359249329758713, "step": 5490}, {"loss": 0.9897, "grad_norm": 1.5957597494125366, "learning_rate": 0.0002, "epoch": 7.372654155495979, "step": 5500}, {"loss": 0.9501, "grad_norm": 1.4234981536865234, "learning_rate": 0.0002, "epoch": 7.386058981233244, "step": 5510}, {"loss": 0.9248, "grad_norm": 1.4279195070266724, "learning_rate": 0.0002, "epoch": 7.399463806970509, "step": 5520}, {"loss": 0.9324, "grad_norm": 1.2789702415466309, "learning_rate": 0.0002, "epoch": 7.412868632707775, "step": 5530}, {"loss": 0.9614, "grad_norm": 1.3967640399932861, "learning_rate": 0.0002, "epoch": 7.42627345844504, "step": 5540}, {"loss": 0.9622, "grad_norm": 1.4384145736694336, "learning_rate": 0.0002, "epoch": 7.439678284182306, "step": 5550}, {"loss": 0.8888, "grad_norm": 1.2486642599105835, "learning_rate": 0.0002, "epoch": 7.453083109919571, "step": 5560}, {"loss": 0.9768, "grad_norm": 1.433598279953003, "learning_rate": 0.0002, "epoch": 7.466487935656836, "step": 5570}, {"loss": 0.9954, "grad_norm": 1.2411381006240845, "learning_rate": 0.0002, "epoch": 7.479892761394102, "step": 5580}, {"loss": 1.0025, "grad_norm": 1.5211423635482788, "learning_rate": 0.0002, "epoch": 7.493297587131368, "step": 5590}, {"loss": 0.996, "grad_norm": 1.916807770729065, "learning_rate": 0.0002, "epoch": 7.506702412868632, "step": 5600}, {"loss": 0.9944, "grad_norm": 1.1726218461990356, "learning_rate": 0.0002, "epoch": 7.520107238605898, "step": 5610}, {"loss": 0.9693, "grad_norm": 1.4437224864959717, "learning_rate": 0.0002, "epoch": 7.533512064343164, "step": 5620}, {"loss": 0.9574, "grad_norm": 1.3450417518615723, "learning_rate": 0.0002, "epoch": 7.546916890080429, "step": 5630}, {"loss": 0.9837, "grad_norm": 1.369955062866211, "learning_rate": 0.0002, "epoch": 7.560321715817694, "step": 5640}, {"loss": 0.985, "grad_norm": 1.323500394821167, "learning_rate": 0.0002, "epoch": 7.57372654155496, "step": 5650}, {"loss": 0.9351, "grad_norm": 1.4024254083633423, "learning_rate": 0.0002, "epoch": 7.587131367292225, "step": 5660}, {"loss": 0.9277, "grad_norm": 1.5177226066589355, "learning_rate": 0.0002, "epoch": 7.600536193029491, "step": 5670}, {"loss": 1.0089, "grad_norm": 1.3379560708999634, "learning_rate": 0.0002, "epoch": 7.613941018766756, "step": 5680}, {"loss": 0.9919, "grad_norm": 1.3165442943572998, "learning_rate": 0.0002, "epoch": 7.627345844504021, "step": 5690}, {"loss": 1.024, "grad_norm": 1.4175701141357422, "learning_rate": 0.0002, "epoch": 7.640750670241287, "step": 5700}, {"loss": 0.9237, "grad_norm": 1.531698226928711, "learning_rate": 0.0002, "epoch": 7.654155495978552, "step": 5710}, {"loss": 1.0119, "grad_norm": 1.3139971494674683, "learning_rate": 0.0002, "epoch": 7.6675603217158175, "step": 5720}, {"loss": 0.9301, "grad_norm": 1.4163814783096313, "learning_rate": 0.0002, "epoch": 7.680965147453083, "step": 5730}, {"loss": 0.9794, "grad_norm": 1.4500303268432617, "learning_rate": 0.0002, "epoch": 7.694369973190349, "step": 5740}, {"loss": 0.9983, "grad_norm": 1.2513974905014038, "learning_rate": 0.0002, "epoch": 7.707774798927614, "step": 5750}, {"loss": 0.9432, "grad_norm": 1.6025257110595703, "learning_rate": 0.0002, "epoch": 7.721179624664879, "step": 5760}, {"loss": 0.9981, "grad_norm": 1.4038569927215576, "learning_rate": 0.0002, "epoch": 7.734584450402145, "step": 5770}, {"loss": 0.9435, "grad_norm": 1.464080572128296, "learning_rate": 0.0002, "epoch": 7.7479892761394105, "step": 5780}, {"loss": 0.974, "grad_norm": 1.51055908203125, "learning_rate": 0.0002, "epoch": 7.761394101876675, "step": 5790}, {"loss": 0.9887, "grad_norm": 1.4638031721115112, "learning_rate": 0.0002, "epoch": 7.774798927613941, "step": 5800}, {"loss": 1.0179, "grad_norm": 1.274057388305664, "learning_rate": 0.0002, "epoch": 7.7882037533512065, "step": 5810}, {"loss": 0.9756, "grad_norm": 1.4633456468582153, "learning_rate": 0.0002, "epoch": 7.801608579088472, "step": 5820}, {"loss": 1.0536, "grad_norm": 1.3144497871398926, "learning_rate": 0.0002, "epoch": 7.815013404825737, "step": 5830}, {"loss": 1.0058, "grad_norm": 1.496511459350586, "learning_rate": 0.0002, "epoch": 7.828418230563003, "step": 5840}, {"loss": 1.0064, "grad_norm": 1.603127360343933, "learning_rate": 0.0002, "epoch": 7.841823056300268, "step": 5850}, {"loss": 1.0116, "grad_norm": 1.376160979270935, "learning_rate": 0.0002, "epoch": 7.855227882037534, "step": 5860}, {"loss": 1.0103, "grad_norm": 1.9300047159194946, "learning_rate": 0.0002, "epoch": 7.868632707774799, "step": 5870}, {"loss": 1.044, "grad_norm": 1.5328046083450317, "learning_rate": 0.0002, "epoch": 7.882037533512064, "step": 5880}, {"loss": 1.022, "grad_norm": 1.4844473600387573, "learning_rate": 0.0002, "epoch": 7.89544235924933, "step": 5890}, {"loss": 1.0594, "grad_norm": 1.3647412061691284, "learning_rate": 0.0002, "epoch": 7.908847184986596, "step": 5900}, {"loss": 0.9822, "grad_norm": 1.4157295227050781, "learning_rate": 0.0002, "epoch": 7.92225201072386, "step": 5910}, {"loss": 0.9722, "grad_norm": 1.4677143096923828, "learning_rate": 0.0002, "epoch": 7.935656836461126, "step": 5920}, {"loss": 0.9871, "grad_norm": 1.322703242301941, "learning_rate": 0.0002, "epoch": 7.949061662198392, "step": 5930}, {"loss": 1.0684, "grad_norm": 1.1980623006820679, "learning_rate": 0.0002, "epoch": 7.962466487935657, "step": 5940}, {"loss": 0.9723, "grad_norm": 1.3701993227005005, "learning_rate": 0.0002, "epoch": 7.975871313672922, "step": 5950}, {"loss": 1.0442, "grad_norm": 1.4934145212173462, "learning_rate": 0.0002, "epoch": 7.989276139410188, "step": 5960}]}